remove duplicate extra and global work group size

commit 224273e0dd
parent 9c5476ead4

1 changed file with 14 additions and 29 deletions
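In short: the three norm kernels and soft_max stop reading the removed global g_work_group_size and use a hard-coded 1024 instead (flagged with a FIXME, since that value is carried over from the CUDA backend), while the mul_mat_vec and cpy paths drop the ggml_tensor_extra_gpu indirection and read src0->data, src1->data and dst->data directly. Two now-redundant GGML_BACKEND_TYPE_GPU asserts are removed along the way.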
@@ -6219,7 +6219,8 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols,
                 });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        // FIXME: 1024 from cuda
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
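The FIXME above records that 1024 is inherited from the CUDA backend rather than taken from the hardware, which is exactly the risk the DPCT1049 warnings in these hunks describe. A minimal standalone sketch (not part of this commit) of how the cap could instead be queried from the SYCL device:

// Sketch only: derive the work-group cap from the device instead of
// hard-coding the CUDA-derived 1024, as the FIXME suggests.
#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstdio>

int main() {
    sycl::queue stream{sycl::default_selector_v};
    // What the hardware actually supports; may be smaller than 1024
    // on some devices.
    const size_t device_max =
        stream.get_device().get_info<sycl::info::device::max_work_group_size>();
    const int work_group_size =
        static_cast<int>(std::min<size_t>(1024, device_max));
    std::printf("work_group_size = %d (device max %zu)\n",
                work_group_size, device_max);
    return 0;
}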
@@ -6265,7 +6266,7 @@ static void group_norm_f32_sycl(const float *x, float *dst,
                 });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
@@ -6354,7 +6355,7 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
                 });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
@@ -9188,7 +9189,7 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
                               const int nrows_y, const float scale, const float max_bias,
                               queue_ptr stream) {
     int nth = WARP_SIZE;
-    int max_block_size = g_work_group_size;
+    int max_block_size = 1024;
     while (nth < ncols_x && nth < max_block_size) nth *= 2;
     if (nth>max_block_size) nth = max_block_size;

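For reference, the unchanged context lines above pick the block size by doubling from WARP_SIZE until it covers ncols_x or reaches the cap, then clamp. A self-contained illustration, assuming WARP_SIZE is 32 as in ggml (the cap is now the hard-coded 1024):

// Standalone illustration of the block-size selection in the hunk above.
#include <cstdio>

static int pick_block_size(int ncols_x, int max_block_size) {
    int nth = 32; // WARP_SIZE
    while (nth < ncols_x && nth < max_block_size) nth *= 2;
    if (nth > max_block_size) nth = max_block_size;
    return nth;
}

int main() {
    std::printf("%d\n", pick_block_size(100, 1024));  // 128: next power of two >= 100
    std::printf("%d\n", pick_block_size(5000, 1024)); // 1024: clamped at the cap
    return 0;
}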
@@ -11393,14 +11394,9 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();

-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;

     ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
@@ -11431,15 +11427,10 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml

     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();

-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;

     const int64_t row_stride_x = nb01 / sizeof(sycl::half);
     const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
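A hypothetical, simplified model of the indirection removed in the two hunks above (names and sizes here are placeholders, not the real ggml definitions): the extra kept a per-device pointer table, but for a tensor whose buffer lives on the active device, tensor->data already holds the same pointer, so the table lookup duplicates it.

// Illustration only; the struct layout is an assumption for this sketch.
#include <cassert>

constexpr int MAX_DEVICES = 16; // stand-in for the backend's device limit

struct extra_sketch {
    void * data_device[MAX_DEVICES]; // per-device buffer pointers
};

struct tensor_sketch {
    void * data;  // buffer pointer used by the new code
    void * extra; // backend payload used by the old code
};

int main() {
    float buf[4] = {0};
    extra_sketch extra = {};
    const int device = 0;
    extra.data_device[device] = buf;

    tensor_sketch t = {buf, &extra};

    // Old path: two loads through the extra's pointer table.
    void * via_extra = static_cast<extra_sketch *>(t.extra)->data_device[device];
    // New path: read tensor->data directly.
    void * direct = t.data;

    assert(via_extra == direct); // the duplication this commit removes
    return 0;
}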
@@ -11983,9 +11974,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));

-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

@@ -11994,11 +11982,8 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();

-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[ctx.device];
-    char * src1_ddc = (char *) src1_extra->data_device[ctx.device];
+    char * src0_ddc = (char *) src0->data;
+    char * src1_ddc = (char *) src1->data;

     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);