remove useless backend check
parent d342abca57
commit 996b35a0ad
1 changed file with 21 additions and 99 deletions

ggml-sycl.cpp (+21 -99)
@@ -10774,14 +10774,10 @@ static void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_ten
     ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
-    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
-    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
-
     // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float * dst_ddf = nullptr;
+    float * src0_ddf = (float *) src0->data;
+    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
+    float * dst_ddf = (float *) dst->data;
 
     ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
     ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
@@ -10792,48 +10788,8 @@ static void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_ten
     // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
     //     ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
 
-    if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[ctx.device];
-    } else {
-        src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        // GGML_SYCL_DEBUG("before ggml_sycl_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
-        SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
-    }
-
-    if (use_src1) {
-        if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[ctx.device];
-        } else {
-            src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
-        }
-    }
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[ctx.device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // GGML_SYCL_DEBUG("op src0=%p, src1=%p, dst=%p, src0_ddf=%p, src1_ddf=%p, dst_ddf=%p, main_stream=%p\n",
-    //     src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     // do the computation
     op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    /*
-    DPCT1010:89: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    SYCL_CHECK(0);
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)).wait()));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
     // print_ggml_tensor("tensor", dst);
 }
 catch (sycl::exception const &exc) {
@@ -10923,7 +10879,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
-    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
 
@@ -10995,20 +10950,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
         used_devices++;
 
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
+        const bool src1_on_device = i == ctx.device;
+        const bool dst_on_device = i == ctx.device;
 
         ggml_sycl_set_device(i);
         queue_ptr stream = ctx.stream(i, 0);
 
-        if (src0_on_device && src0_is_contiguous) {
-            dev[i].src0_dd = (char *) src0_extra->data_device[i];
+        if (src0_is_contiguous) {
+            dev[i].src0_dd = (char *) src0->data;
         } else {
             dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0));
         }
 
         if (src1_on_device && src1_is_contiguous) {
-            dev[i].src1_ddf = (float *) src1_extra->data_device[i];
+            dev[i].src1_ddf = (float *) src1->data;
         } else {
             dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
         }
@@ -11028,7 +10983,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         }
 
         if (dst_on_device) {
-            dev[i].dst_dd = (float *) dst_extra->data_device[i];
+            dev[i].dst_dd = (float *) dst->data;
         } else {
             const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst);
             dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf);
@@ -11059,8 +11014,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             continue;
         }
 
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
+        const bool src1_on_device = i == ctx.device;
+        const bool dst_on_device = i == ctx.device;
         const int64_t row_diff = dev[i].row_high - dev[i].row_low;
 
         ggml_sycl_set_device(i);
@@ -11091,12 +11046,12 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
             // the main device memory buffer can be on VRAM scratch, with space for all partial results
             // in that case an offset on dst_ddf_i is needed
-            if (dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device) {
+            if (i == ctx.device) {
                 dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split
             }
 
             // copy src0, src1 to device if necessary
-            if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
+            if (src1_is_contiguous) {
                 if (i != ctx.device) {
                     if (convert_src1_to_q8_1) {
                         char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
@@ -11114,14 +11069,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                             src1_ncols * ne10 * sizeof(float))));
                     }
                 }
-            } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
+            } else if (src1_on_device && !src1_is_contiguous) {
                 SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
                     src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
                 GGML_ASSERT(false);
             }
 
-            if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
+            if (convert_src1_to_q8_1 && !src1_is_contiguous) {
                 quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 /*
                 DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -11131,7 +11086,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                 SYCL_CHECK(0);
             }
 
-            if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+            if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
                 SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream));
             }
             if (src1->type == GGML_TYPE_F16) {
@@ -11149,17 +11104,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
             // copy dst to host or other device if necessary
             if (!dst_on_device) {
-                void * dst_off_device;
-                dpct::memcpy_direction kind;
-                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                    dst_off_device = dst->data;
-                    kind = dpct::device_to_host;
-                } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
-                    dst_off_device = dst_extra->data_device[ctx.device];
-                    kind = dpct::device_to_device;
-                } else {
-                    GGML_ASSERT(false);
-                }
+                void * dst_off_device = dst->data;
                 if (split) {
                     // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                     // dst is NOT transposed.
@@ -11170,27 +11115,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                     GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                     dhf_dst_i += src1_col_0*ne0 + dev[i].row_low;
 
-                    //todo, dirty solution. Need be updated when device2device memcpy() is supported.
-                    if (kind == dpct::device_to_device) {
-                        size_t dst_size = ggml_nbytes_pad(dst);
-                        float *host_buf = (float *)malloc(dst_size);
-                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
-                            host_buf, ne0 * sizeof(float), dst_dd_i,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, dpct::device_to_host, *stream)));
-                        dpct::dev_mgr::instance().get_device(i).queues_wait_and_throw();
-                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
-                            dhf_dst_i, ne0 * sizeof(float), host_buf,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, dpct::host_to_device, *main_stream)));
-                        dpct::dev_mgr::instance().get_device(ctx.device).queues_wait_and_throw();
-                        free(host_buf);
-                    } else {
-                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
-                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, kind, *stream)));
-                    }
+                    SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
+                        dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
+                        row_diff * sizeof(float), row_diff * sizeof(float),
+                        src1_ncols, dpct::device_to_device, *stream)));
                 } else {
                     float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                     GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -11234,12 +11162,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             }
         }
     }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
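
The pattern behind the whole diff, as a minimal before/after sketch (illustrative helper names, assuming the ggml / ggml-sycl internal types; this is not code from the commit): with a single SYCL backend, every tensor reaching these ops already lives in device memory, so `tensor->data` is the device pointer and both the `tensor->backend` checks and the `ggml_tensor_extra_gpu::data_device` indirection become dead weight.

    // Sketch only: ggml_tensor, ggml_tensor_extra_gpu and the
    // GGML_BACKEND_TYPE_* enum come from ggml.h / ggml-sycl.cpp.

    // before: branch on tensor->backend, fetch the pointer through the extras
    static float * tensor_device_ptr_before(const ggml_tensor * t, int device) {
        const bool on_device = t->backend == GGML_BACKEND_TYPE_GPU ||
                               t->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
        if (on_device) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) t->extra;
            return (float *) extra->data_device[device]; // per-device copy
        }
        return nullptr; // caller had to allocate a pool buffer and copy from host
    }

    // after: the check always passes, so t->data already is the device pointer
    static float * tensor_device_ptr_after(const ggml_tensor * t) {
        return (float *) t->data;
    }

This is also why the host-copy fallbacks (the `ggml_sycl_cpy_tensor_2d` staging in `ggml_sycl_op_flatten` and the `dpct::device_to_host` path in `ggml_sycl_op_mul_mat`) could be deleted along with the checks.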