implemented missing SYCL event APIs
commit 002e457788
parent 323df41251
2 changed files with 94 additions and 32 deletions
@@ -4927,8 +4927,8 @@ GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
     GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
-    SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
-        (char *)tensor->data + offset, data, size).wait()));
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        (stream)->memcpy((char *)tensor->data + offset, data, size)));
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
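
The hunk above drops the `.wait()` call, so `ggml_backend_sycl_set_tensor_async` now returns as soon as the copy is enqueued instead of blocking on every upload. A minimal standalone sketch of that fire-now, wait-later pattern in plain SYCL (buffer size is hypothetical; not part of the commit):

#include <sycl/sycl.hpp>
#include <vector>

int main() {
    sycl::queue q;

    std::vector<float> host(1024, 1.0f);
    float *dev = sycl::malloc_device<float>(host.size(), q);

    // Asynchronous H2D copy: memcpy() returns immediately with an event,
    // exactly like the un-waited memcpy in the hunk above.
    sycl::event e = q.memcpy(dev, host.data(), host.size() * sizeof(float));

    // ... other host work can overlap with the transfer here ...

    e.wait(); // synchronize only when the data is actually needed
    sycl::free(dev, q);
    return 0;
}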
@@ -5181,13 +5181,73 @@ GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, gg
     return buft_ctx->device == sycl_ctx->device;
 }
 
+static ggml_backend_event_t
+ggml_backend_sycl_event_new(ggml_backend_t backend) {
+    ggml_backend_sycl_context *sycl_ctx =
+        (ggml_backend_sycl_context *)backend->context;
+
+    sycl::event *event_ptr = new sycl::event();
+
+    return new ggml_backend_event{
+        /* .backend = */ backend,
+        /* .context = */ event_ptr,
+    };
+}
+
+static void ggml_backend_sycl_event_free(ggml_backend_event_t event) try {
+    if (event == nullptr) {
+        return;
+    }
+
+    if (event->context != nullptr) {
+        sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+        delete sycl_event;
+        event->context = nullptr;
+    }
+
+    delete event;
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static void ggml_backend_sycl_event_record(ggml_backend_event_t event) try {
+    if (event == nullptr || event->context == nullptr) {
+        return;
+    }
+
+    ggml_backend_sycl_context *sycl_ctx =
+        (ggml_backend_sycl_context *)event->backend->context;
+    sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+
+    const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
+    // Record the current state of the queue
+    *sycl_event = stream->ext_oneapi_submit_barrier();
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static void ggml_backend_sycl_event_synchronize(ggml_backend_event_t event) {
+    if (event == nullptr || event->context == nullptr) {
+        return;
+    }
+
+    sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+    SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
+}
+
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name                = */ ggml_backend_sycl_name,
     /* .free                    = */ ggml_backend_sycl_free,
     /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
-    /* .cpy_tensor_async        = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
+    /* .cpy_tensor_async        = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
+                                           // // TODO: update for the new
+                                           // interface
     /* .synchronize             = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
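
Caller-side view of the four hooks implemented above, as a sketch that uses only the public ggml-backend calls this same commit relies on in llama.cpp (`upload_then_wait` and its arguments are hypothetical):

#include "ggml-backend.h"

// Enqueue an async tensor upload, mark the queue state with an event, and
// later block on that event instead of synchronizing the whole backend.
static void upload_then_wait(ggml_backend_t backend, struct ggml_tensor *t,
                             const void *src, size_t nbytes) {
    ggml_backend_event_t ev = ggml_backend_event_new(backend);  // -> event_new

    ggml_backend_tensor_set_async(backend, t, src, 0, nbytes);  // async copy
    ggml_backend_event_record(ev);                              // -> event_record

    // ... overlap other CPU work here ...

    ggml_backend_event_synchronize(ev);                         // -> event_synchronize
    ggml_backend_event_free(ev);                                // -> event_free
}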
@@ -5197,11 +5257,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
     /* .supports_buft           = */ ggml_backend_sycl_supports_buft,
     /* .offload_op              = */ ggml_backend_sycl_offload_op,
-    /* .event_new               = */ NULL,
-    /* .event_free              = */ NULL,
-    /* .event_record            = */ NULL,
+    /* .event_new               = */ ggml_backend_sycl_event_new,
+    /* .event_free              = */ ggml_backend_sycl_event_free,
+    /* .event_record            = */ ggml_backend_sycl_event_record,
     /* .event_wait              = */ NULL,
-    /* .event_synchronize       = */ NULL,
+    /* .event_synchronize       = */ ggml_backend_sycl_event_synchronize,
 };
 
 static ggml_guid_t ggml_backend_sycl_guid() {
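For reference, the `event_record` hook above leans on the DPC++ extension `ext_oneapi_submit_barrier()`, which returns an event that completes once everything already submitted to the queue has finished. A standalone sketch of that record/wait semantics (availability depends on the SYCL implementation; not part of the commit):

#include <sycl/sycl.hpp>

int main() {
    sycl::queue q;
    int *buf = sycl::malloc_device<int>(256, q);

    q.fill(buf, 0, 256);                     // work submitted before the "record"

    // "Record": capture everything submitted so far as a single event.
    sycl::event recorded = q.ext_oneapi_submit_barrier();

    q.fill(buf, 1, 256);                     // later work; the barrier event
                                             // does not depend on it

    // "Synchronize": waits only for the work captured at record time.
    recorded.wait();

    sycl::free(buf, q);
    return 0;
}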
@@ -5082,9 +5082,7 @@ struct llama_model_loader {
                 }
             }
         }
-#endif
-
-#if defined(GGML_USE_SYCL)
+#elif defined(GGML_USE_SYCL)
         // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
         // NVMe raid configurations might require more / larger buffers.
         constexpr size_t n_buffers = 4;
@@ -5092,7 +5090,7 @@ struct llama_model_loader {
 
         std::vector<ggml_backend_buffer_t> host_buffers;
         std::vector<void*> host_ptrs;
-        // std::vector<ggml_backend_event_t> events;
+        std::vector<ggml_backend_event_t> events;
         size_t buffer_idx = 0; // buffer to use for async loads
 
         ggml_backend_t sycl_backend = nullptr;

@@ -5116,6 +5114,7 @@ struct llama_model_loader {
             for (size_t idx = 0; idx < n_buffers; ++idx) {
                 host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
                 host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                events.emplace_back(ggml_backend_event_new(sycl_backend));
             }
         }
     }
@@ -5197,24 +5196,29 @@ struct llama_model_loader {
 
         }
         else
-#endif
-#if defined(GGML_USE_SYCL)
-        // If sycl_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+#elif defined(GGML_USE_SYCL)
+        // If sycl_backend is valid load the tensor in chunks to
+        // pinned memory and upload the buffers asynchronously to the
+        // GPU.
         if (sycl_backend) {
             file->seek(weight->offs, SEEK_SET);
 
             size_t bytes_read = 0;
 
             while (bytes_read < n_size) {
-                size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+                size_t read_iteration =
+                    std::min<size_t>(buffer_size, n_size - bytes_read);
+                ggml_backend_event_synchronize(events[buffer_idx]);
                 file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                ggml_backend_tensor_set_async(sycl_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                ggml_backend_tensor_set_async(sycl_backend, cur,
+                                              host_ptrs[buffer_idx],
+                                              bytes_read, read_iteration);
+                ggml_backend_event_record(events[buffer_idx]);
                 bytes_read += read_iteration;
                 ++buffer_idx;
                 buffer_idx %= n_buffers;
             }
-        }
-        else
+        } else
 #endif
         {
             read_buf.resize(n_size);
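
The loop above cycles through the staging slots: synchronize on a slot's event before overwriting it, refill it from the file, launch the async upload, then record a fresh event on that slot. A self-contained sketch of the same ring-buffer pattern in plain SYCL (the `source` vector stands in for the weight file; all sizes are hypothetical):

#include <sycl/sycl.hpp>
#include <algorithm>
#include <array>
#include <cstring>
#include <vector>

int main() {
    constexpr size_t n_buffers   = 4;
    constexpr size_t buffer_size = 1 << 20;           // 1 MiB per staging slot

    sycl::queue q;
    std::array<char *, n_buffers>      host_ptrs{};
    std::array<sycl::event, n_buffers> events{};      // default events are already complete
    for (auto &p : host_ptrs) p = sycl::malloc_host<char>(buffer_size, q);

    std::vector<char> source(8 * buffer_size, 42);    // stand-in for file data
    char  *dst        = sycl::malloc_device<char>(source.size(), q);
    size_t bytes_read = 0, buffer_idx = 0;

    while (bytes_read < source.size()) {
        size_t chunk = std::min(buffer_size, source.size() - bytes_read);
        events[buffer_idx].wait();                    // is this slot safe to reuse?
        std::memcpy(host_ptrs[buffer_idx], source.data() + bytes_read, chunk);
        events[buffer_idx] =                          // async upload doubles as the "record"
            q.memcpy(dst + bytes_read, host_ptrs[buffer_idx], chunk);
        bytes_read += chunk;
        buffer_idx  = (buffer_idx + 1) % n_buffers;
    }

    q.wait();                                         // drain before freeing, as in the cleanup hunk below
    for (auto p : host_ptrs) sycl::free(p, q);
    sycl::free(dst, q);
    return 0;
}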
@@ -5241,15 +5245,13 @@ struct llama_model_loader {
             }
             ggml_backend_free(cuda_backend);
         }
-#endif
-
-#if defined(GGML_USE_SYCL)
+#elif defined(GGML_USE_SYCL)
         // free temporary resources used for async cuda uploads
        if (sycl_backend) {
             for (size_t idx = 0; idx < n_buffers;++idx) {
-                // ggml_backend_event_synchronize(events[idx]);
-                // ggml_backend_event_free(events[idx]);
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
                 ggml_backend_buffer_free(host_buffers[idx]);
             }
             ggml_backend_free(sycl_backend);
         }