diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index ece5449d6..38e4c56ce 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -32,6 +32,7 @@ #include "binbcast.hpp" #include "argmax.hpp" #include "argsort.hpp" +#include "cpy.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp new file mode 100644 index 000000000..061fc0684 --- /dev/null +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -0,0 +1,389 @@ +#include "cpy.hpp" + +static void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + sycl::half * dsti = (sycl::half *) cdsti; + + *dsti = sycl::vec(*xi).convert()[0]; +} + +static void cpy_1_f16_f16(const char * cxi, char * cdsti) { + const sycl::half * xi = (const sycl::half *) cxi; + sycl::half * dsti = (sycl::half *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_f16_f32(const char * cxi, char * cdsti) { + const sycl::half * xi = (const sycl::half *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_i16_i16(const char * cxi, char * cdsti) { + const int16_t * xi = (const int16_t *) cxi; + int16_t * dsti = (int16_t *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_i32_i32(const char * cxi, char * cdsti) { + const int32_t * xi = (const int32_t *) cxi; + int32_t * dsti = (int32_t *) cdsti; + + *dsti = *xi; +} + +template +static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + + if (i >= ne) { + return; + } + + // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = sycl::fmax(amax, sycl::fabs((float) v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j] * id; + + dsti->qs[j] = sycl::round((float) x0); + } +} + +static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float) v)) { + amax = sycl::fabs((float) v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0 / 2; ++j) { + const float x0 = xi[0 + j] * id; + const float x1 = xi[QK4_0 / 2 + j] * id; + + const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + if (v < vmin) { + vmin = v; + } + if (v > vmax) { + vmax = v; + } + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 1.0f / d : 0.0f; + + dsti->dm.x() = d; + dsti->dm.y() = vmin; + + for (int j = 0; j < QK4_1 / 2; ++j) { + const float x0 = (xi[0 + j] - vmin) * id; + const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id; + + const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +template +static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; + + if (i >= ne) { + return; + } + + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_blck(cx + x_offset, cdst + dst_offset); +} + +static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK8_0 == 0); + const int num_blocks = ne / QK8_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK4_0 == 0); + const int num_blocks = ne / QK4_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK4_1 == 0); + const int num_blocks = ne / QK4_1; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS; + + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + queue_ptr main_stream = ctx.stream(); + + char * src0_ddc = (char *) src0->data; + char * src1_ddc = (char *) src1->data; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f16_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f16_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { + ggml_cpy_i16_i16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + ggml_cpy_i32_i32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else { + GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), + ggml_type_name(src1->type)); + GGML_ABORT("fatal error"); + } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp new file mode 100644 index 000000000..6be62076f --- /dev/null +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -0,0 +1,11 @@ +#ifndef GGML_SYCL_CPY_HPP +#define GGML_SYCL_CPY_HPP + +#include "common.hpp" +#include + +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); + +void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); + +#endif // GGML_SYCL_CPY_HPP \ No newline at end of file diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 803ea6c23..c147b6ec0 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1265,8 +1265,6 @@ std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(q // struct ggml_sycl_pool_vmm : public ggml_sycl_pool /// kernels - -typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_sycl_op_mul_mat_t)( ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -1525,193 +1523,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous } } -static void cpy_1_f32_f32(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_f32_f16(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - sycl::half *dsti = (sycl::half *)cdsti; - - *dsti = sycl::vec(*xi) - .convert()[0]; -} - -static void cpy_1_f16_f16(const char * cxi, char * cdsti) { - const sycl::half *xi = (const sycl::half *)cxi; - sycl::half *dsti = (sycl::half *)cdsti; - - *dsti = *xi; -} - -static void cpy_1_f16_f32(const char * cxi, char * cdsti) { - const sycl::half *xi = (const sycl::half *)cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_i16_i16(const char * cxi, char * cdsti) { - const int16_t *xi = (const int16_t *)cxi; - int16_t *dsti = (int16_t *)cdsti; - - *dsti = *xi; -} - -static void cpy_1_i32_i32(const char * cxi, char * cdsti) { - const int32_t *xi = (const int32_t *)cxi; - int32_t *dsti = (int32_t *)cdsti; - - *dsti = *xi; -} - -template -static void cpy_f32_f16(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= ne) { - return; - } - - // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor - // then combine those indices with the corresponding byte offsets to get the total offsets - const int i03 = i/(ne00 * ne01 * ne02); - const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - - const int i13 = i/(ne10 * ne11 * ne12); - const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; - - cpy_1(cx + x_offset, cdst + dst_offset); -} - -static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q8_0 * dsti = (block_q8_0 *) cdsti; - - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = xi[j]; - amax = sycl::fmax(amax, sycl::fabs((float)v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = xi[j]*id; - - dsti->qs[j] = sycl::round((float)x0); - } -} - -static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_0 * dsti = (block_q4_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_0; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float)v)) { - amax = sycl::fabs((float)v); - vmax = v; - } - } - - const float d = vmax / -8; - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK4_0/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK4_0/2 + j]*id; - - const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_1 * dsti = (block_q4_1 *) cdsti; - - float vmin = FLT_MAX; - float vmax = -FLT_MAX; - - for (int j = 0; j < QK4_1; ++j) { - const float v = xi[j]; - - if (v < vmin) vmin = v; - if (v > vmax) vmax = v; - } - - const float d = (vmax - vmin) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->dm.x() = d; - dsti->dm.y() = vmin; - - for (int j = 0; j < QK4_1/2; ++j) { - const float x0 = (xi[0 + j] - vmin)*id; - const float x1 = (xi[QK4_1/2 + j] - vmin)*id; - - const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -template -static void cpy_f32_q(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { - const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2)) * - qk; - - if (i >= ne) { - return; - } - - const int i03 = i/(ne00 * ne01 * ne02); - const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - - const int i13 = i/(ne10 * ne11 * ne12); - const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; - - cpy_blck(cx + x_offset, cdst + dst_offset); -} - static void k_sum_rows_f32(const float * x, float * dst, const int ncols, const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(1); @@ -1970,232 +1781,6 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( } } -static void -ggml_cpy_f16_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00, - const int ne01, const int ne02, const int nb00, - const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, - const int nb10, const int nb11, const int nb12, - const int nb13, queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, - nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK8_0 == 0); - const int num_blocks = ne / QK8_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK4_0 == 0); - const int num_blocks = ne / QK4_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK4_1 == 0); - const int num_blocks = ne / QK4_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - // dpct::has_capability_or_fail(stream->get_device(), - // {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - // dpct::has_capability_or_fail(stream->get_device(), - // {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - static void scale_f32_sycl(const float *x, float *dst, const float scale, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE; @@ -3550,51 +3135,6 @@ static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) ggml_sycl_op_clamp(ctx, dst); } -static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { - const int64_t ne = ggml_nelements(src0); - GGML_ASSERT(ne == ggml_nelements(src1)); - - GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); - GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - - GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS; - - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - queue_ptr main_stream = ctx.stream(); - - char * src0_ddc = (char *) src0->data; - char * src1_ddc = (char *) src1->data; - - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f16_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { - ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { - ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else { - GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, - ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ABORT("fatal error"); - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // TODO: why do we pass dst as src1 here? ggml_sycl_cpy(ctx, dst->src[0], dst);