From 67e6b3cb7d7b14069031d40351e916e532e94dd2 Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Tue, 23 Jan 2024 03:32:09 +0000 Subject: [PATCH] align pr4766 --- common/common.cpp | 8 ++++- ggml-sycl.cpp | 79 +++++++++++++++++++++++++++++++++++------------ ggml-sycl.h | 3 -- llama.cpp | 15 ++++----- 4 files changed, 74 insertions(+), 31 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b26daf2fd..abe752352 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -651,9 +651,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.tensor_split[i] = 0.0f; } } -#ifndef GGML_USE_CLBLAS_SYCL +#else fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n"); #endif // GGML_USE_CLBLAS_SYCL + } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) + params.mul_mat_q = false; +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS or SYCL. Disabling mul_mat_q kernels has no effect.\n"); +#endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; } else if (arg == "--numa") { diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 9253c9534..499511555 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -11621,17 +11621,23 @@ catch (sycl::exception const &exc) { #define UNUSED GGML_UNUSED +struct ggml_backend_sycl_context { + int device; + std::string name; +}; + // sycl buffer -struct ggml_backend_buffer_context_sycl { +struct ggml_backend_sycl_buffer_context { int device; void * dev_ptr = nullptr; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; + std::string name; - ggml_backend_buffer_context_sycl(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} + ggml_backend_sycl_buffer_context(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} - ~ggml_backend_buffer_context_sycl() { + ~ ggml_backend_sycl_buffer_context() { delete[] temp_tensor_extras; } @@ -11649,9 +11655,18 @@ struct ggml_backend_buffer_context_sycl { } }; +GGML_CALL static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; + return ctx->name.c_str(); +} + +GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name; +} + static void ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { - ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); int device_index = get_device_index_by_id(ctx->device); const dpct::queue_ptr stream = g_syclStreams[device_index][0]; @@ -11667,13 +11682,13 @@ catch (sycl::exception const &exc) { } static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; return ctx->dev_ptr; } static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { - ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { assert(tensor->view_src->buffer->buft == buffer->buft); @@ -11719,7 +11734,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t size) try { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); int device_index = get_device_index_by_id(ctx->device); @@ -11744,7 +11759,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t size) try { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); int device_index = get_device_index_by_id(ctx->device); @@ -11766,7 +11781,7 @@ catch (sycl::exception const &exc) { static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) try { - ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); int device_index = get_device_index_by_id(ctx->device); @@ -11784,18 +11799,29 @@ catch (sycl::exception const &exc) { std::exit(1); } -static struct ggml_backend_buffer_i sycl_backend_buffer_interface = { +static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { + /* .get_name = */ ggml_backend_sycl_buffer_get_name, /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer, /* .get_base = */ ggml_backend_sycl_buffer_get_base, /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor, /* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor, /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor, - /* .cpy_tensor_from = */ NULL, - /* .cpy_tensor_to = */ NULL, + /* .cpy_tensor = */ NULL, /* .clear = */ ggml_backend_sycl_buffer_clear, + /* .reset = */ NULL, }; // sycl buffer type +struct ggml_backend_sycl_buffer_type_context { + int device; + std::string name; +}; + +GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, @@ -11811,9 +11837,9 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( size, *stream))); - ggml_backend_buffer_context_sycl * ctx = new ggml_backend_buffer_context_sycl(device, dev_ptr); + ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(device, dev_ptr); - return ggml_backend_buffer_init(buft, sycl_backend_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -11827,7 +11853,7 @@ static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_ty UNUSED(buft); } -static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { +static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -11854,6 +11880,7 @@ static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_t } static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { + /* .get_name = */ ggml_backend_sycl_buffer_type_name, /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, @@ -11881,6 +11908,18 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { // host buffer type +GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_SYCL_NAME "_Host"; + + UNUSED(buft); +} + +GGML_CALL static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) { + return GGML_SYCL_NAME "_Host"; + + UNUSED(buffer); +} + static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_sycl_host_free(buffer->context); } @@ -11904,6 +11943,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = { /* .iface = */ { + /* .get_name = */ ggml_backend_sycl_host_buffer_type_name, /* .alloc_buffer = */ ggml_backend_sycl_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, @@ -11990,7 +12030,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -static ggml_backend_graph_plan_t ggml_backend_sycl_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_sycl_graph_plan_create(ggml_backend_t backend, const ggml_cgraph * cgraph) { GGML_ASSERT(!"not implemented"); return nullptr; @@ -12180,14 +12220,13 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten UNUSED(backend); } -static ggml_backend_i sycl_backend_i = { +static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_name, /* .free = */ ggml_backend_sycl_free, /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type, /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async, /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, + /* .cpy_tensor_async = */ NULL, /* .synchronize = */ ggml_backend_sycl_synchronize, /* .graph_plan_create = */ ggml_backend_sycl_graph_plan_create, /* .graph_plan_free = */ ggml_backend_sycl_graph_plan_free, @@ -12212,7 +12251,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) { }; ggml_backend_t sycl_backend = new ggml_backend { - /* .interface = */ sycl_backend_i, + /* .interface = */ ggml_backend_sycl_interface, /* .context = */ ctx }; diff --git a/ggml-sycl.h b/ggml-sycl.h index e4379a987..3dd2bc205 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -2386,7 +2386,6 @@ namespace dpct const void *beta, void *c, library_data_t c_type, int ldc, library_data_t scaling_type) { - bool matched = false; if (scaling_type == library_data_t::real_float && c_type == library_data_t::complex_float) { @@ -2542,7 +2541,6 @@ namespace dpct #ifdef DPCT_USM_LEVEL_NONE throw std::runtime_error("this API is unsupported when USM level is none"); #else - bool matched = false; if (scaling_type == library_data_t::real_float && c_type == library_data_t::complex_float) { @@ -2712,7 +2710,6 @@ namespace dpct int ldc, long long int stride_c, int batch_size, library_data_t scaling_type) { - bool matched = false; if (scaling_type == library_data_t::real_float && c_type == library_data_t::complex_float) { diff --git a/llama.cpp b/llama.cpp index eb9426f44..2dfff45c3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6703,7 +6703,7 @@ static int llama_decode_internal( } const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1; - if ((ggml_cpu_has_cublas() || ggml_cpu_has_sycl()) && fully_offloaded) { + if (ggml_cpu_has_cublas() && fully_offloaded) { n_threads = 1; } @@ -9939,13 +9939,14 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_SYCL) if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_sycl_init(0); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL backend\n", __func__); - } + ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); } - - #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) {