diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 870676c96..bf36355d4 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -63,6 +63,10 @@ static int g_ggml_sycl_debug=0; static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +static void crash(){ + int *ptr = NULL; + *ptr = 0; +} static void ggml_sycl_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) { fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg); @@ -358,22 +362,21 @@ struct ggml_tensor_extra_gpu { inline dpct::err0 ggml_sycl_set_device(const int device) try { int current_device; + SYCL_CHECK(CHECK_TRY_ERROR( current_device = dpct::dev_mgr::instance().current_device_id())); + // GGML_SYCL_DEBUG("ggml_sycl_set_device device=%d, current_device=%d\n", device, current_device); if (device == current_device) { return 0; } - /* - DPCT1093:53: The "device" device may be not the one intended for use. Adjust - the selected device if needed. - */ return CHECK_TRY_ERROR(dpct::select_device(device)); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + crash(); std::exit(1); } @@ -6391,6 +6394,13 @@ int get_device_index_by_id(int id){ return res; } +int get_device_id_by_index(int index){ + int res = g_device_caps[index].device_id; + GGML_ASSERT(res>=0); + return res; +} + + int get_current_device_index(){ return get_device_index_by_id(dpct::dev_mgr::instance().current_device_id()); } @@ -8258,12 +8268,13 @@ bool ggml_sycl_loaded(void) { } void print_devices(){ int device_count = dpct::dev_mgr::instance().device_count(); - fprintf(stderr, "%s: found %d SYCL devices:\n", __func__, device_count); + fprintf(stderr, "\n%s: found %d SYCL devices:\n", __func__, device_count); for (int id = 0; id < device_count; ++id) { dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( prop, dpct::dev_mgr::instance().get_device(id)))); - fprintf(stderr, " Device %d: %s, compute capability %d.%d, max compute_units %d, max work group size %d, max sub group size %d, global mem size %lu\n", id, + sycl::device cur_device = dpct::dev_mgr::instance().get_device(id); + fprintf(stderr, " Device %d: %s,\tcompute capability %d.%d,\n\tmax compute_units %d,\tmax work group size %d,\tmax sub group size %d,\tglobal mem size %lu\n", id, prop.get_name(), prop.get_major_version(), prop.get_minor_version(), prop.get_max_compute_units(), @@ -8272,6 +8283,7 @@ void print_devices(){ prop.get_global_mem_size() ); } + fprintf(stderr, "\n"); } int get_sycl_env(const char* env_name, int default_val){ @@ -8302,7 +8314,7 @@ void ggml_init_sycl() try { printf("GGML_SYCL_DEBUG=%d\n", g_ggml_sycl_debug); - int user_device_number = get_sycl_env("GGML_SYCL_DEVICE", 0); + int user_device_id = get_sycl_env("GGML_SYCL_DEVICE", 0); print_devices(); @@ -8341,7 +8353,7 @@ void ggml_init_sycl() try { int device_inx = -1; for (int id = 0; id < g_all_sycl_device_count; ++id) { - if(id!=user_device_number) continue; + if(id!=user_device_id) continue; device_inx++; int device_vmm = 0; @@ -8369,14 +8381,14 @@ void ggml_init_sycl() try { } device_inx = -1; for (int id = 0; id < g_all_sycl_device_count; ++id) { - if(id!=user_device_number) continue; + if(id!=user_device_id) continue; device_inx++; g_tensor_split[device_inx] /= total_vram; } device_inx = -1; for (int id = 0; id < g_all_sycl_device_count; ++id) { - if(id!=user_device_number) continue; + if(id!=user_device_id) continue; device_inx++; SYCL_CHECK(ggml_sycl_set_device(id)); @@ -8406,9 +8418,9 @@ void ggml_init_sycl() try { //hardcode, force set to 1 device g_device_count = 1; - ggml_sycl_set_main_device(user_device_number); - ggml_sycl_set_device(user_device_number); - fprintf(stderr, "Using Device %d\n", user_device_number); + ggml_sycl_set_main_device(user_device_id); + ggml_sycl_set_device(user_device_id); + fprintf(stderr, "Using Device %d\n", user_device_id); // for (int id = 0; id < g_all_sycl_device_count; ++id) { // GGML_SYCL_DEBUG("id=%d g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id, @@ -9583,7 +9595,8 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, ggml_sycl_set_device(g_main_device); dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; - GGML_SYCL_DEBUG("g_main_device_index=%d, src0=%p\n", g_main_device_index, src0); + GGML_SYCL_DEBUG("g_main_device_index=%d, src0=%p main_stream=%p src0_on_device=%d\n", + g_main_device_index, src0, main_stream, src0_on_device); if (src0_on_device) { src0_ddf = (float *) src0_extra->data_device[g_main_device_index]; @@ -9609,6 +9622,8 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, dst_ddf = dst_f.alloc(ggml_nelements(dst)); } + GGML_SYCL_DEBUG("op src0=%p, src1=%p, dst=%p, src0_ddf=%p, src1_ddf=%p, dst_ddf=%p, main_stream=%p\n", + src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); // do the computation op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); /* @@ -9648,12 +9663,12 @@ static void ggml_sycl_set_peer_access(const int n_tokens) { #ifdef NDEBUG for (int id = 0; id < g_device_count; ++id) { - SYCL_CHECK(ggml_sycl_set_device(g_device_caps[id].device_id)); + SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); // SYCL_CHECK(syclDeviceSynchronize()); } for (int id = 0; id < g_device_count; ++id) { - SYCL_CHECK(ggml_sycl_set_device(g_device_caps[id].device_id)); + SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); int device_id = g_device_caps[id].device_id; for (int id_other = 0; id_other < g_device_count; ++id_other) { @@ -9786,7 +9801,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; - ggml_sycl_set_device(id); + ggml_sycl_set_device(get_device_id_by_index(id)); const dpct::queue_ptr stream = g_syclStreams[id][0]; if (src0_on_device && src0_is_contiguous) { @@ -9852,7 +9867,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; const int64_t row_diff = row_high[id] - row_low[id]; - ggml_sycl_set_device(id); + ggml_sycl_set_device(get_device_id_by_index(id)); const dpct::queue_ptr stream = g_syclStreams[id][is]; // wait for main GPU data if necessary @@ -9983,7 +9998,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) { continue; } - SYCL_CHECK(ggml_sycl_set_device(id)); + SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); // free buffers again when done if (dst_as[id] > 0) { @@ -10978,8 +10993,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try { if (backend == GGML_BACKEND_GPU && id != g_main_device_index) { continue; } - - ggml_sycl_set_device(id); + ggml_sycl_set_device(get_device_id_by_index(id)); int64_t row_low, row_high; if (backend == GGML_BACKEND_GPU) { @@ -11059,14 +11073,14 @@ void ggml_sycl_free_data(struct ggml_tensor *tensor) try { for (int64_t id = 0; id < g_device_count; ++id) { if (extra->data_device[id] != nullptr) { - SYCL_CHECK(ggml_sycl_set_device(id)); + SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); SYCL_CHECK(CHECK_TRY_ERROR(sycl::free( extra->data_device[id], dpct::get_in_order_queue()))); } for (int64_t is = 0; is < MAX_STREAMS; ++is) { if (extra->events[id][is] != nullptr) { - SYCL_CHECK(ggml_sycl_set_device(id)); + SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); SYCL_CHECK(CHECK_TRY_ERROR( dpct::destroy_event(extra->events[id][is]))); }