step 5 format device and print
This commit is contained in:
parent
da752edaf5
commit
6dd32789b4
2 changed files with 138 additions and 165 deletions
292
ggml-sycl.cpp
292
ggml-sycl.cpp
|
@ -59,20 +59,10 @@ static int g_ggml_sycl_debug=0;
|
||||||
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||||
|
|
||||||
|
|
||||||
static const char *cublas_get_error_str(const int err) {
|
static void ggml_sycl_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
|
||||||
/*
|
fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
|
||||||
DPCT1009:48: SYCL uses exceptions to report errors and does not use the
|
|
||||||
error codes. The original code was commented out and a warning string
|
|
||||||
was inserted. You need to rewrite this code.
|
|
||||||
*/
|
|
||||||
return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/
|
|
||||||
;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
|
|
||||||
fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
|
|
||||||
fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
|
fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
|
||||||
GGML_ASSERT(!"CUDA error");
|
GGML_ASSERT(!"SYCL error");
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -86,36 +76,11 @@ DPCT1009:52: SYCL uses exceptions to report errors and does not use the error
|
||||||
codes. The original code was commented out and a warning string was inserted.
|
codes. The original code was commented out and a warning string was inserted.
|
||||||
You need to rewrite this code.
|
You need to rewrite this code.
|
||||||
*/
|
*/
|
||||||
#define CUDA_CHECK(err) do { \
|
#define SYCL_CHECK(err) do { \
|
||||||
auto err_ = (err); if (err_ != 0) ggml_cuda_error( \
|
auto err_ = (err); if (err_ != 0) ggml_sycl_error( \
|
||||||
#err, __func__, __FILE__, __LINE__, \
|
#err, __func__, __FILE__, __LINE__, \
|
||||||
"cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/); \
|
"Meet error in this line code!"); \
|
||||||
} while (0)
|
} while (0)
|
||||||
#define CUBLAS_CHECK(err) \
|
|
||||||
do { auto err_ = (err); if (err_ != 0) \
|
|
||||||
ggml_cuda_error(#err, __func__, __FILE__, __LINE__, \
|
|
||||||
cublas_get_error_str(err_)); } while (0)
|
|
||||||
|
|
||||||
|
|
||||||
static const char *cu_get_error_str(int err) {
|
|
||||||
const char * err_str;
|
|
||||||
/*
|
|
||||||
DPCT1007:49: Migration of cuGetErrorString is not supported.
|
|
||||||
*/
|
|
||||||
// cuGetErrorString(err, &err_str);
|
|
||||||
return err_str;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
DPCT1001:67: The statement could not be removed.
|
|
||||||
*/
|
|
||||||
/*
|
|
||||||
DPCT1000:68: Error handling if-stmt was detected but could not be rewritten.
|
|
||||||
*/
|
|
||||||
#define CU_CHECK(err) \
|
|
||||||
do { auto err_ = (err); \
|
|
||||||
if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, \
|
|
||||||
cu_get_error_str(err_)); } while (0)
|
|
||||||
|
|
||||||
|
|
||||||
#if DPCT_COMPAT_RT_VERSION >= 11100
|
#if DPCT_COMPAT_RT_VERSION >= 11100
|
||||||
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
||||||
|
@ -401,7 +366,7 @@ struct ggml_tensor_extra_gpu {
|
||||||
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
|
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
|
||||||
inline dpct::err0 ggml_cuda_set_device(const int device) try {
|
inline dpct::err0 ggml_cuda_set_device(const int device) try {
|
||||||
int current_device;
|
int current_device;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
current_device = dpct::dev_mgr::instance().current_device_id()));
|
current_device = dpct::dev_mgr::instance().current_device_id()));
|
||||||
|
|
||||||
if (device == current_device) {
|
if (device == current_device) {
|
||||||
|
@ -6420,7 +6385,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -6535,7 +6500,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -6650,7 +6615,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -6765,7 +6730,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -6880,7 +6845,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -6995,7 +6960,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -7118,7 +7083,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -7246,7 +7211,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -7367,7 +7332,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -7488,7 +7453,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
const int compute_capability = g_device_caps[id].cc;
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
|
@ -8073,7 +8038,7 @@ static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
|
static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
#ifdef DEBUG_CUDA_MALLOC
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
int nnz = 0;
|
int nnz = 0;
|
||||||
|
@ -8115,7 +8080,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
|
||||||
void * ptr;
|
void * ptr;
|
||||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
|
DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
|
||||||
look_ahead_size, dpct::get_in_order_queue())));
|
look_ahead_size, dpct::get_in_order_queue())));
|
||||||
*actual_size = look_ahead_size;
|
*actual_size = look_ahead_size;
|
||||||
|
@ -8135,7 +8100,7 @@ catch (sycl::exception const &exc) {
|
||||||
static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
|
static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
|
|
||||||
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
||||||
|
@ -8147,7 +8112,7 @@ static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
|
SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
|
||||||
g_cuda_pool_size[id] -= size;
|
g_cuda_pool_size[id] -= size;
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
|
@ -8179,7 +8144,7 @@ catch (sycl::exception const &exc) {
|
||||||
static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
|
static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
|
|
||||||
#ifdef DEBUG_CUDA_MALLOC
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
|
@ -8199,7 +8164,7 @@ catch (sycl::exception const &exc) {
|
||||||
|
|
||||||
static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
|
static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
if (g_device_caps[id].vmm) {
|
if (g_device_caps[id].vmm) {
|
||||||
return ggml_cuda_pool_malloc_vmm(size, actual_size);
|
return ggml_cuda_pool_malloc_vmm(size, actual_size);
|
||||||
|
@ -8215,7 +8180,7 @@ catch (sycl::exception const &exc) {
|
||||||
|
|
||||||
static void ggml_cuda_pool_free(void *ptr, size_t size) try {
|
static void ggml_cuda_pool_free(void *ptr, size_t size) try {
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
if (g_device_caps[id].vmm) {
|
if (g_device_caps[id].vmm) {
|
||||||
ggml_cuda_pool_free_vmm(ptr, size);
|
ggml_cuda_pool_free_vmm(ptr, size);
|
||||||
|
@ -8270,9 +8235,10 @@ bool ggml_cublas_loaded(void) {
|
||||||
}
|
}
|
||||||
void print_devices(){
|
void print_devices(){
|
||||||
int device_count = dpct::dev_mgr::instance().device_count();
|
int device_count = dpct::dev_mgr::instance().device_count();
|
||||||
|
fprintf(stderr, "%s: found %d SYCL devices:\n", __func__, device_count);
|
||||||
for (int id = 0; id < device_count; ++id) {
|
for (int id = 0; id < device_count; ++id) {
|
||||||
dpct::device_info prop;
|
dpct::device_info prop;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
|
||||||
prop, dpct::dev_mgr::instance().get_device(id))));
|
prop, dpct::dev_mgr::instance().get_device(id))));
|
||||||
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id,
|
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id,
|
||||||
prop.get_name(), prop.get_major_version(),
|
prop.get_name(), prop.get_major_version(),
|
||||||
|
@ -8305,7 +8271,7 @@ void ggml_init_cublas() try {
|
||||||
|
|
||||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||||
|
|
||||||
printf("g_ggml_sycl_debug=%d\n", g_ggml_sycl_debug);
|
printf("GGML_SYCL_DEBUG=%d\n", g_ggml_sycl_debug);
|
||||||
|
|
||||||
int user_device_number = get_sycl_env("GGML_SYCL_DEVICE", 0);
|
int user_device_number = get_sycl_env("GGML_SYCL_DEVICE", 0);
|
||||||
|
|
||||||
|
@ -8321,17 +8287,19 @@ void ggml_init_cublas() try {
|
||||||
|
|
||||||
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
||||||
int64_t total_vram = 0;
|
int64_t total_vram = 0;
|
||||||
#if defined(GGML_CUDA_FORCE_MMQ)
|
|
||||||
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
#if defined(GGML_SYCL_FP16)
|
||||||
|
fprintf(stderr, "%s: GGML_SYCL_FP16: yes\n", __func__);
|
||||||
#else
|
#else
|
||||||
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
fprintf(stderr, "%s: GGML_SYCL_FP16: no\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(CUDA_USE_TENSOR_CORES)
|
#if defined(CUDA_USE_TENSOR_CORES)
|
||||||
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
||||||
#else
|
#else
|
||||||
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
|
||||||
|
|
||||||
//zjy hardcode, force set to 1 device
|
//zjy hardcode, force set to 1 device
|
||||||
g_device_count = 1;
|
g_device_count = 1;
|
||||||
|
@ -8341,8 +8309,8 @@ void ggml_init_cublas() try {
|
||||||
g_device_caps[id].vmm = !!device_vmm;
|
g_device_caps[id].vmm = !!device_vmm;
|
||||||
|
|
||||||
dpct::device_info prop;
|
dpct::device_info prop;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
|
||||||
prop, dpct::dev_mgr::instance().get_device(id))));
|
prop, dpct::dev_mgr::instance().get_device(user_device_number))));
|
||||||
/*
|
/*
|
||||||
DPCT1005:86: The SYCL device version is different from CUDA Compute
|
DPCT1005:86: The SYCL device version is different from CUDA Compute
|
||||||
Compatibility. You may need to rewrite this code.
|
Compatibility. You may need to rewrite this code.
|
||||||
|
@ -8369,7 +8337,7 @@ void ggml_init_cublas() try {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(user_device_number));
|
SYCL_CHECK(ggml_cuda_set_device(user_device_number));
|
||||||
|
|
||||||
// create cuda streams
|
// create cuda streams
|
||||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||||
|
@ -8377,25 +8345,25 @@ void ggml_init_cublas() try {
|
||||||
DPCT1025:88: The SYCL queue is created ignoring the flag and
|
DPCT1025:88: The SYCL queue is created ignoring the flag and
|
||||||
priority options.
|
priority options.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
g_cudaStreams[id][is] =
|
g_cudaStreams[id][is] =
|
||||||
dpct::get_current_device().create_queue()));
|
dpct::get_current_device().create_queue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// create cublas handle
|
// create cublas handle
|
||||||
CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
|
SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
|
||||||
&dpct::get_in_order_queue()));
|
&dpct::get_in_order_queue()));
|
||||||
/*
|
/*
|
||||||
DPCT1027:89: The call to cublasSetMathMode was replaced with 0
|
DPCT1027:89: The call to cublasSetMathMode was replaced with 0
|
||||||
because this functionality is redundant in SYCL.
|
because this functionality is redundant in SYCL.
|
||||||
*/
|
*/
|
||||||
CUBLAS_CHECK(0);
|
SYCL_CHECK(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// configure logging to stdout
|
// configure logging to stdout
|
||||||
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
// SYCL_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
||||||
ggml_cuda_set_device(user_device_number);
|
ggml_cuda_set_device(user_device_number);
|
||||||
fprintf(stderr, " set Device %d\n", user_device_number);
|
fprintf(stderr, "Using Device %d\n", user_device_number);
|
||||||
initialized = true;
|
initialized = true;
|
||||||
g_cublas_loaded = true;
|
g_cublas_loaded = true;
|
||||||
}
|
}
|
||||||
|
@ -8473,7 +8441,7 @@ catch (sycl::exception const &exc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_host_free(void *ptr) try {
|
void ggml_cuda_host_free(void *ptr) try {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
|
SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||||
|
@ -8497,7 +8465,7 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
|
||||||
kind = dpct::device_to_device;
|
kind = dpct::device_to_device;
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
id = dpct::dev_mgr::instance().current_device_id()));
|
id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
src_ptr = (char *) extra->data_device[id];
|
src_ptr = (char *) extra->data_device[id];
|
||||||
} else {
|
} else {
|
||||||
|
@ -8915,7 +8883,7 @@ inline void ggml_cuda_op_mul_mat_q(
|
||||||
const int64_t row_diff = row_high - row_low;
|
const int64_t row_diff = row_high - row_low;
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
|
|
||||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||||
|
@ -9155,7 +9123,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||||
const int64_t row_diff = row_high - row_low;
|
const int64_t row_diff = row_high - row_low;
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
|
||||||
|
|
||||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||||
|
@ -9197,8 +9165,8 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||||
const sycl::half alpha_f16 = 1.0f;
|
const sycl::half alpha_f16 = 1.0f;
|
||||||
const sycl::half beta_f16 = 0.0f;
|
const sycl::half beta_f16 = 0.0f;
|
||||||
|
|
||||||
CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
|
SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
|
||||||
CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
|
||||||
*g_cublas_handles[id], oneapi::mkl::transpose::trans,
|
*g_cublas_handles[id], oneapi::mkl::transpose::trans,
|
||||||
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||||
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
|
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
|
||||||
|
@ -9224,8 +9192,8 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||||
const float alpha = 1.0f;
|
const float alpha = 1.0f;
|
||||||
const float beta = 0.0f;
|
const float beta = 0.0f;
|
||||||
|
|
||||||
CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
|
SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
|
||||||
CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
|
SYCL_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
|
||||||
*g_cublas_handles[id], oneapi::mkl::transpose::trans,
|
*g_cublas_handles[id], oneapi::mkl::transpose::trans,
|
||||||
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||||
dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00,
|
dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00,
|
||||||
|
@ -9494,7 +9462,7 @@ inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||||
DPCT1010:87: SYCL uses exceptions to report errors and does not use the
|
DPCT1010:87: SYCL uses exceptions to report errors and does not use the
|
||||||
error codes. The call was replaced with 0. You need to rewrite this code.
|
error codes. The call was replaced with 0. You need to rewrite this code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(0);
|
SYCL_CHECK(0);
|
||||||
|
|
||||||
(void) src1;
|
(void) src1;
|
||||||
(void) dst;
|
(void) dst;
|
||||||
|
@ -9519,7 +9487,7 @@ inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||||
DPCT1010:88: SYCL uses exceptions to report errors and does not use the
|
DPCT1010:88: SYCL uses exceptions to report errors and does not use the
|
||||||
error codes. The call was replaced with 0. You need to rewrite this code.
|
error codes. The call was replaced with 0. You need to rewrite this code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(0);
|
SYCL_CHECK(0);
|
||||||
|
|
||||||
(void) src1;
|
(void) src1;
|
||||||
(void) dst;
|
(void) dst;
|
||||||
|
@ -9561,7 +9529,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
|
||||||
src0_ddf = (float *) src0_extra->data_device[g_main_device];
|
src0_ddf = (float *) src0_extra->data_device[g_main_device];
|
||||||
} else {
|
} else {
|
||||||
src0_ddf = src0_f.alloc(ggml_nelements(src0));
|
src0_ddf = src0_f.alloc(ggml_nelements(src0));
|
||||||
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (use_src1) {
|
if (use_src1) {
|
||||||
|
@ -9569,7 +9537,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
|
||||||
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
||||||
} else {
|
} else {
|
||||||
src1_ddf = src1_f.alloc(ggml_nelements(src1));
|
src1_ddf = src1_f.alloc(ggml_nelements(src1));
|
||||||
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
|
SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (dst_on_device) {
|
if (dst_on_device) {
|
||||||
|
@ -9584,16 +9552,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
|
||||||
DPCT1010:89: SYCL uses exceptions to report errors and does not use the
|
DPCT1010:89: SYCL uses exceptions to report errors and does not use the
|
||||||
error codes. The call was replaced with 0. You need to rewrite this code.
|
error codes. The call was replaced with 0. You need to rewrite this code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(0);
|
SYCL_CHECK(0);
|
||||||
|
|
||||||
// copy dst to host if necessary
|
// copy dst to host if necessary
|
||||||
if (!dst_on_device) {
|
if (!dst_on_device) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
|
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_CPU) {
|
if (dst->backend == GGML_BACKEND_CPU) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
dpct::get_current_device().queues_wait_and_throw()));
|
dpct::get_current_device().queues_wait_and_throw()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9614,12 +9582,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
||||||
|
|
||||||
#ifdef NDEBUG
|
#ifdef NDEBUG
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
SYCL_CHECK(ggml_cuda_set_device(id));
|
||||||
// CUDA_CHECK(cudaDeviceSynchronize());
|
// SYCL_CHECK(cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
SYCL_CHECK(ggml_cuda_set_device(id));
|
||||||
|
|
||||||
for (int id_other = 0; id_other < g_device_count; ++id_other) {
|
for (int id_other = 0; id_other < g_device_count; ++id_other) {
|
||||||
if (id == id_other) {
|
if (id == id_other) {
|
||||||
|
@ -9630,12 +9598,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int can_access_peer;
|
int can_access_peer;
|
||||||
// CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
// SYCL_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||||
// if (can_access_peer) {
|
// if (can_access_peer) {
|
||||||
// if (enable_peer_access) {
|
// if (enable_peer_access) {
|
||||||
// CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
// SYCL_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
||||||
// } else {
|
// } else {
|
||||||
// CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
|
// SYCL_CHECK(cudaDeviceDisablePeerAccess(id_other));
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
@ -9776,7 +9744,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
use the error codes. The call was replaced with 0. You need to
|
use the error codes. The call was replaced with 0. You need to
|
||||||
rewrite this code.
|
rewrite this code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(0);
|
SYCL_CHECK(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9791,13 +9759,13 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
// if multiple devices are used they need to wait for the main device
|
// if multiple devices are used they need to wait for the main device
|
||||||
// here an event is recorded that signals that the main device has finished calculating the input data
|
// here an event is recorded that signals that the main device has finished calculating the input data
|
||||||
if (split && used_devices > 1) {
|
if (split && used_devices > 1) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
/*
|
/*
|
||||||
DPCT1024:91: The original code returned the error code that was further
|
DPCT1024:91: The original code returned the error code that was further
|
||||||
consumed by the program logic. This original code was replaced with 0.
|
consumed by the program logic. This original code was replaced with 0.
|
||||||
You may need to rewrite the program logic consuming the error code.
|
You may need to rewrite the program logic consuming the error code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
*src0_extra->events[g_main_device][0] =
|
*src0_extra->events[g_main_device][0] =
|
||||||
g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
|
g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
|
||||||
}
|
}
|
||||||
|
@ -9821,7 +9789,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
|
|
||||||
// wait for main GPU data if necessary
|
// wait for main GPU data if necessary
|
||||||
if (split && (id != g_main_device || is != 0)) {
|
if (split && (id != g_main_device || is != 0)) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
|
SYCL_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
|
||||||
{*src0_extra->events[g_main_device][0]})));
|
{*src0_extra->events[g_main_device][0]})));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9848,20 +9816,20 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
if (id != g_main_device) {
|
if (id != g_main_device) {
|
||||||
if (convert_src1_to_q8_1) {
|
if (convert_src1_to_q8_1) {
|
||||||
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
|
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
|
SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
|
||||||
src1_ddq_i, src1_ddq_i_source,
|
src1_ddq_i, src1_ddq_i_source,
|
||||||
src1_ncols * src1_padded_col_size * q8_1_ts /
|
src1_ncols * src1_padded_col_size * q8_1_ts /
|
||||||
q8_1_bs)));
|
q8_1_bs)));
|
||||||
} else {
|
} else {
|
||||||
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
||||||
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
|
SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
|
||||||
src1_ddf_i, src1_ddf_i_source,
|
src1_ddf_i, src1_ddf_i_source,
|
||||||
src1_ncols * ne10 * sizeof(float))));
|
src1_ncols * ne10 * sizeof(float))));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
||||||
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
SYCL_CHECK(ggml_cuda_cpy_tensor_2d(
|
||||||
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
||||||
} else {
|
} else {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
@ -9874,11 +9842,11 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
not use the error codes. The call was replaced with 0. You
|
not use the error codes. The call was replaced with 0. You
|
||||||
need to rewrite this code.
|
need to rewrite this code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(0);
|
SYCL_CHECK(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
|
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
|
||||||
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
|
SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
// do the computation
|
// do the computation
|
||||||
|
@ -9889,7 +9857,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
use the error codes. The call was replaced with 0. You need to
|
use the error codes. The call was replaced with 0. You need to
|
||||||
rewrite this code.
|
rewrite this code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(0);
|
SYCL_CHECK(0);
|
||||||
|
|
||||||
// copy dst to host or other device if necessary
|
// copy dst to host or other device if necessary
|
||||||
if (!dst_on_device) {
|
if (!dst_on_device) {
|
||||||
|
@ -9913,7 +9881,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
||||||
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
||||||
dhf_dst_i += src1_col_0*ne0 + row_low[id];
|
dhf_dst_i += src1_col_0*ne0 + row_low[id];
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
|
||||||
dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
|
dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
|
||||||
row_diff * sizeof(float), row_diff * sizeof(float),
|
row_diff * sizeof(float), row_diff * sizeof(float),
|
||||||
src1_ncols, kind, *stream)));
|
src1_ncols, kind, *stream)));
|
||||||
|
@ -9921,7 +9889,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
||||||
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
||||||
dhf_dst_i += src1_col_0*ne0;
|
dhf_dst_i += src1_col_0*ne0;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
stream->memcpy(dhf_dst_i, dst_dd_i,
|
stream->memcpy(dhf_dst_i, dst_dd_i,
|
||||||
src1_ncols * ne0 * sizeof(float))));
|
src1_ncols * ne0 * sizeof(float))));
|
||||||
}
|
}
|
||||||
|
@ -9935,7 +9903,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
code was replaced with 0. You may need to rewrite the
|
code was replaced with 0. You may need to rewrite the
|
||||||
program logic consuming the error code.
|
program logic consuming the error code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
*src0_extra->events[id][is] =
|
*src0_extra->events[id][is] =
|
||||||
stream->ext_oneapi_submit_barrier()));
|
stream->ext_oneapi_submit_barrier()));
|
||||||
}
|
}
|
||||||
|
@ -9947,7 +9915,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
SYCL_CHECK(ggml_cuda_set_device(id));
|
||||||
|
|
||||||
// free buffers again when done
|
// free buffers again when done
|
||||||
if (dst_as[id] > 0) {
|
if (dst_as[id] > 0) {
|
||||||
|
@ -9969,13 +9937,13 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
||||||
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
|
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
|
||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
if (row_low[id] == row_high[id]) {
|
if (row_low[id] == row_high[id]) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (int64_t is = 0; is < is_max; ++is) {
|
for (int64_t is = 0; is < is_max; ++is) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
|
g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
|
||||||
{*src0_extra->events[id][is]})));
|
{*src0_extra->events[id][is]})));
|
||||||
}
|
}
|
||||||
|
@ -9983,8 +9951,8 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_CPU) {
|
if (dst->backend == GGML_BACKEND_CPU) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
dpct::get_current_device().queues_wait_and_throw()));
|
dpct::get_current_device().queues_wait_and_throw()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10101,7 +10069,7 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
|
||||||
|
|
||||||
const int64_t ne12 = src1->ne[2];
|
const int64_t ne12 = src1->ne[2];
|
||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
|
dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
|
@ -10140,7 +10108,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
|
||||||
|
|
||||||
const int64_t ne12 = src1->ne[2];
|
const int64_t ne12 = src1->ne[2];
|
||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
|
dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
|
@ -10219,10 +10187,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
|
||||||
const int64_t ne1 = ggml_nelements(src1);
|
const int64_t ne1 = ggml_nelements(src1);
|
||||||
const int64_t ne = ggml_nelements(dst);
|
const int64_t ne = ggml_nelements(dst);
|
||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
|
dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
|
||||||
|
|
||||||
CUBLAS_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
|
DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
|
@ -10291,7 +10259,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
|
||||||
int i03 = i13 / r3;
|
int i03 = i13 / r3;
|
||||||
int i02 = i12 / r2;
|
int i02 = i12 / r2;
|
||||||
|
|
||||||
CUBLAS_CHECK(
|
SYCL_CHECK(
|
||||||
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
ne01, ne11, ne10,
|
ne01, ne11, ne10,
|
||||||
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
||||||
|
@ -10306,7 +10274,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
|
||||||
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
||||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||||
// use cublasGemmStridedBatchedEx
|
// use cublasGemmStridedBatchedEx
|
||||||
CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
|
||||||
*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
|
*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
|
||||||
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||||
(const char *)src0_as_f16, dpct::library_data_t::real_half,
|
(const char *)src0_as_f16, dpct::library_data_t::real_half,
|
||||||
|
@ -10353,9 +10321,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
|
||||||
error codes. The call was replaced with 0. You need to rewrite this
|
error codes. The call was replaced with 0. You need to rewrite this
|
||||||
code.
|
code.
|
||||||
*/
|
*/
|
||||||
CUDA_CHECK(0);
|
SYCL_CHECK(0);
|
||||||
|
|
||||||
CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
|
||||||
*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
|
*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
|
||||||
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||||
(const void **)(ptrs_src.get() + 0 * ne23),
|
(const void **)(ptrs_src.get() + 0 * ne23),
|
||||||
|
@ -10534,10 +10502,10 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
||||||
const int64_t ne1 = ggml_nelements(src1);
|
const int64_t ne1 = ggml_nelements(src1);
|
||||||
const int64_t ne = ggml_nelements(dst);
|
const int64_t ne = ggml_nelements(dst);
|
||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||||
|
|
||||||
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
SYCL_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
|
||||||
|
|
||||||
//ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
//ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
//void * src0_ddq = src0_extra->data_device[g_main_device];
|
//void * src0_ddq = src0_extra->data_device[g_main_device];
|
||||||
|
@ -10607,9 +10575,9 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
||||||
dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
|
dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
|
||||||
dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
|
dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
|
||||||
);
|
);
|
||||||
CUDA_CHECK(cudaGetLastError());
|
SYCL_CHECK(cudaGetLastError());
|
||||||
|
|
||||||
CUBLAS_CHECK(
|
SYCL_CHECK(
|
||||||
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
ne01, ne11, ne10,
|
ne01, ne11, ne10,
|
||||||
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
|
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
|
||||||
|
@ -10658,9 +10626,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
|
||||||
|
|
||||||
if (ids->backend == GGML_BACKEND_GPU) {
|
if (ids->backend == GGML_BACKEND_GPU) {
|
||||||
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
|
SYCL_CHECK(DPCT_CHECK_ERROR(stream->wait()));
|
||||||
} else {
|
} else {
|
||||||
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
|
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
|
||||||
}
|
}
|
||||||
|
@ -10691,8 +10659,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
|
||||||
|
|
||||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||||
//int32_t row_id;
|
//int32_t row_id;
|
||||||
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
//SYCL_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
||||||
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
//SYCL_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
||||||
|
|
||||||
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
||||||
|
|
||||||
|
@ -10735,7 +10703,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
|
||||||
|
|
||||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
|
stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
|
||||||
src1_original + i01 * nb11, nb11)));
|
src1_original + i01 * nb11, nb11)));
|
||||||
num_src1_rows++;
|
num_src1_rows++;
|
||||||
|
@ -10768,7 +10736,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
|
||||||
|
|
||||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
|
SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
|
||||||
dst_original + i01 * nb1,
|
dst_original + i01 * nb1,
|
||||||
dst_contiguous.get() + num_src1_rows * nb1, nb1)));
|
dst_contiguous.get() + num_src1_rows * nb1, nb1)));
|
||||||
num_src1_rows++;
|
num_src1_rows++;
|
||||||
|
@ -10777,7 +10745,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_CPU) {
|
if (dst->backend == GGML_BACKEND_CPU) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
|
SYCL_CHECK(DPCT_CHECK_ERROR(stream->wait()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
|
@ -10821,7 +10789,7 @@ static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||||
const int64_t nb11 = src1->nb[1];
|
const int64_t nb11 = src1->nb[1];
|
||||||
const int64_t nb12 = src1->nb[2];
|
const int64_t nb12 = src1->nb[2];
|
||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
|
dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
|
||||||
|
|
||||||
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
|
@ -10958,19 +10926,19 @@ void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
||||||
}
|
}
|
||||||
|
|
||||||
char * buf;
|
char * buf;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
|
SYCL_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
|
||||||
size, dpct::get_in_order_queue())));
|
size, dpct::get_in_order_queue())));
|
||||||
char * buf_host = (char *)data + offset_split;
|
char * buf_host = (char *)data + offset_split;
|
||||||
|
|
||||||
// set padding to 0 to avoid possible NaN values
|
// set padding to 0 to avoid possible NaN values
|
||||||
if (size > original_size) {
|
if (size > original_size) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
dpct::get_in_order_queue()
|
dpct::get_in_order_queue()
|
||||||
.memset(buf + original_size, 0, size - original_size)
|
.memset(buf + original_size, 0, size - original_size)
|
||||||
.wait()));
|
.wait()));
|
||||||
}
|
}
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
|
||||||
.memcpy(buf, buf_host, original_size)
|
.memcpy(buf, buf_host, original_size)
|
||||||
.wait()));
|
.wait()));
|
||||||
|
|
||||||
|
@ -10978,7 +10946,7 @@ void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
||||||
|
|
||||||
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
|
SYCL_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
|
||||||
new sycl::event()));
|
new sycl::event()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -11001,15 +10969,15 @@ void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
|
||||||
|
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
if (extra->data_device[id] != nullptr) {
|
if (extra->data_device[id] != nullptr) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
SYCL_CHECK(ggml_cuda_set_device(id));
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(
|
SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(
|
||||||
extra->data_device[id], dpct::get_in_order_queue())));
|
extra->data_device[id], dpct::get_in_order_queue())));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||||
if (extra->events[id][is] != nullptr) {
|
if (extra->events[id][is] != nullptr) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
SYCL_CHECK(ggml_cuda_set_device(id));
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
dpct::destroy_event(extra->events[id][is])));
|
dpct::destroy_event(extra->events[id][is])));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -11070,7 +11038,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||||
force_inplace;
|
force_inplace;
|
||||||
const size_t size = ggml_nbytes(tensor);
|
const size_t size = ggml_nbytes(tensor);
|
||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
||||||
|
@ -11093,7 +11061,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||||
|
|
||||||
char * data = (char *) g_scratch_buffer;
|
char * data = (char *) g_scratch_buffer;
|
||||||
if (data == nullptr) {
|
if (data == nullptr) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
data = (char *)sycl::malloc_device(
|
data = (char *)sycl::malloc_device(
|
||||||
g_scratch_size, dpct::get_in_order_queue())));
|
g_scratch_size, dpct::get_in_order_queue())));
|
||||||
g_scratch_buffer = data;
|
g_scratch_buffer = data;
|
||||||
|
@ -11106,9 +11074,9 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||||
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
||||||
} else { // allocate new buffers outside of scratch
|
} else { // allocate new buffers outside of scratch
|
||||||
void * data;
|
void * data;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
|
SYCL_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
|
||||||
size, dpct::get_in_order_queue())));
|
size, dpct::get_in_order_queue())));
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
dpct::get_in_order_queue().memset(data, 0, size).wait()));
|
dpct::get_in_order_queue().memset(data, 0, size).wait()));
|
||||||
extra = new ggml_tensor_extra_gpu;
|
extra = new ggml_tensor_extra_gpu;
|
||||||
memset(extra, 0, sizeof(*extra));
|
memset(extra, 0, sizeof(*extra));
|
||||||
|
@ -11130,7 +11098,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
|
||||||
}
|
}
|
||||||
if (g_scratch_buffer == nullptr) {
|
if (g_scratch_buffer == nullptr) {
|
||||||
ggml_cuda_set_device(g_main_device);
|
ggml_cuda_set_device(g_main_device);
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
|
DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
|
||||||
g_scratch_size, dpct::get_in_order_queue())));
|
g_scratch_size, dpct::get_in_order_queue())));
|
||||||
}
|
}
|
||||||
|
@ -11164,8 +11132,8 @@ void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
|
||||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
SYCL_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
|
||||||
.memcpy(extra->data_device[g_main_device],
|
.memcpy(extra->data_device[g_main_device],
|
||||||
tensor->data, ggml_nbytes(tensor))
|
tensor->data, ggml_nbytes(tensor))
|
||||||
.wait()));
|
.wait()));
|
||||||
|
@ -11202,7 +11170,7 @@ void ggml_cuda_set_main_device(const int main_device) try {
|
||||||
if (g_main_device != main_device && g_device_count > 1) {
|
if (g_main_device != main_device && g_device_count > 1) {
|
||||||
g_main_device = main_device;
|
g_main_device = main_device;
|
||||||
dpct::device_info prop;
|
dpct::device_info prop;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
|
||||||
prop, dpct::dev_mgr::instance().get_device(g_main_device))));
|
prop, dpct::dev_mgr::instance().get_device(g_main_device))));
|
||||||
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
|
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
|
||||||
g_main_device, prop.get_name());
|
g_main_device, prop.get_name());
|
||||||
|
@ -11228,7 +11196,7 @@ void ggml_cuda_free_scratch() try {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
|
sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
|
||||||
g_scratch_buffer = nullptr;
|
g_scratch_buffer = nullptr;
|
||||||
}
|
}
|
||||||
|
@ -11413,7 +11381,7 @@ catch (sycl::exception const &exc) {
|
||||||
void ggml_cuda_get_device_description(int device, char *description,
|
void ggml_cuda_get_device_description(int device, char *description,
|
||||||
size_t description_size) try {
|
size_t description_size) try {
|
||||||
dpct::device_info prop;
|
dpct::device_info prop;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
|
||||||
prop, dpct::dev_mgr::instance().get_device(device))));
|
prop, dpct::dev_mgr::instance().get_device(device))));
|
||||||
snprintf(description, description_size, "%s", prop.get_name());
|
snprintf(description, description_size, "%s", prop.get_name());
|
||||||
}
|
}
|
||||||
|
@ -11460,7 +11428,7 @@ struct ggml_backend_buffer_context_cuda {
|
||||||
static void
|
static void
|
||||||
ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
|
DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
|
||||||
delete ctx;
|
delete ctx;
|
||||||
}
|
}
|
||||||
|
@ -11503,7 +11471,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||||
|
|
||||||
if (padded_size > original_size && tensor->view_src == nullptr) {
|
if (padded_size > original_size && tensor->view_src == nullptr) {
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
|
SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
|
||||||
(char *)tensor->data + original_size, 0,
|
(char *)tensor->data + original_size, 0,
|
||||||
padded_size - original_size)));
|
padded_size - original_size)));
|
||||||
}
|
}
|
||||||
|
@ -11526,10 +11494,10 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
||||||
|
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(dpct::get_in_order_queue()
|
DPCT_CHECK_ERROR(dpct::get_in_order_queue()
|
||||||
.memcpy((char *)tensor->data + offset, data, size)
|
.memcpy((char *)tensor->data + offset, data, size)
|
||||||
.wait()));
|
.wait()));
|
||||||
|
@ -11549,10 +11517,10 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(
|
SYCL_CHECK(DPCT_CHECK_ERROR(
|
||||||
dpct::get_in_order_queue()
|
dpct::get_in_order_queue()
|
||||||
.memcpy(data, (const char *)tensor->data + offset, size)
|
.memcpy(data, (const char *)tensor->data + offset, size)
|
||||||
.wait()));
|
.wait()));
|
||||||
|
@ -11568,10 +11536,10 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer,
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
CUDA_CHECK(
|
SYCL_CHECK(
|
||||||
DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
|
SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
|
||||||
.memset(ctx->dev_ptr, value, buffer->size)
|
.memset(ctx->dev_ptr, value, buffer->size)
|
||||||
.wait()));
|
.wait()));
|
||||||
}
|
}
|
||||||
|
@ -11604,7 +11572,7 @@ ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||||
|
|
||||||
void * dev_ptr;
|
void * dev_ptr;
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
|
SYCL_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
|
||||||
size, dpct::get_in_order_queue())));
|
size, dpct::get_in_order_queue())));
|
||||||
|
|
||||||
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
|
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
|
||||||
|
@ -11746,7 +11714,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
|
SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
|
||||||
(char *)tensor->data + offset, data, size)));
|
(char *)tensor->data + offset, data, size)));
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
|
@ -11764,7 +11732,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
|
SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
|
||||||
data, (const char *)tensor->data + offset, size)));
|
data, (const char *)tensor->data + offset, size)));
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
|
@ -11776,7 +11744,7 @@ catch (sycl::exception const &exc) {
|
||||||
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
|
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||||
|
|
||||||
CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
|
SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
|
||||||
|
|
||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
11
run.sh
11
run.sh
|
@ -5,9 +5,14 @@ INPUT1="The process of Origami seems simple at the first glance, but in fact, it
|
||||||
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
export GGML_SYCL_DEVIC=0
|
if [ $# -gt 0 ]; then
|
||||||
export GGML_SYCL_DEBUG=1
|
export GGML_SYCL_DEVICE=$1
|
||||||
export GGML_SYCL_LIST_DEVICE=1
|
else
|
||||||
|
export GGML_SYCL_DEVICE=0
|
||||||
|
fi
|
||||||
|
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
|
||||||
|
#export GGML_SYCL_DEBUG=1
|
||||||
|
#export GGML_SYCL_LIST_DEVICE=1
|
||||||
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
|
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
|
||||||
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
|
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue