ggml-backend : add device and backend reg interfaces (#9707)
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
parent
a39ab216aa
commit
c83ad6d01e
28 changed files with 1809 additions and 1303 deletions
|
@ -99,11 +99,11 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
|||
int id = -1; // in case cudaGetDevice fails
|
||||
cudaGetDevice(&id);
|
||||
|
||||
GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
|
||||
GGML_CUDA_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
|
||||
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
||||
GGML_CUDA_LOG_ERROR(" %s\n", stmt);
|
||||
// abort with GGML_ASSERT to get a stack trace
|
||||
GGML_ABORT("CUDA error");
|
||||
// abort with GGML_ABORT to get a stack trace
|
||||
GGML_ABORT(GGML_CUDA_NAME " error");
|
||||
}
|
||||
|
||||
// this is faster on Windows
|
||||
|
@ -327,7 +327,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|||
return;
|
||||
}
|
||||
}
|
||||
GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
||||
GGML_CUDA_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
||||
ggml_cuda_set_device(device);
|
||||
CUDA_CHECK(cudaFree(ptr));
|
||||
pool_size -= size;
|
||||
|
@ -457,26 +457,26 @@ struct ggml_backend_cuda_buffer_context {
|
|||
}
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
return ctx->dev_ptr;
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL) {
|
||||
|
@ -496,7 +496,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
|||
}
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
|
@ -504,7 +504,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer
|
|||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
|
@ -512,7 +512,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
|
|||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
|
@ -520,7 +520,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
|
|||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
||||
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
||||
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
|
||||
|
@ -541,7 +541,7 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
|
|||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
|
@ -550,7 +550,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffe
|
|||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
||||
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
||||
|
@ -569,17 +569,17 @@ struct ggml_backend_cuda_buffer_type_context {
|
|||
std::string name;
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
|
||||
return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
|
||||
ggml_cuda_set_device(buft_ctx->device);
|
||||
|
@ -600,13 +600,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
|
|||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
|
||||
|
@ -621,8 +621,8 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
|
|||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
||||
static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
|
@ -630,7 +630,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
|||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
|
@ -643,9 +643,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|||
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
||||
|
||||
if (!ggml_backend_cuda_buffer_type_initialized) {
|
||||
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
||||
for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
|
||||
ggml_backend_cuda_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
|
||||
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
|
||||
};
|
||||
}
|
||||
|
@ -715,7 +716,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
|||
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_CUDA_NAME "_Split";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
|
@ -726,19 +727,19 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
|||
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
||||
return (void *)0x1000;
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
|
@ -786,7 +787,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
|
|||
tensor->extra = extra;
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
@ -824,7 +825,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
|
|||
}
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
@ -862,12 +863,12 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
|
|||
}
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
GGML_UNUSED(buffer);
|
||||
GGML_UNUSED(value);
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||
static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
||||
|
@ -882,17 +883,17 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
|||
|
||||
// cuda split buffer type
|
||||
|
||||
GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_CUDA_NAME "_Split";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
|
||||
return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
||||
// instead, we allocate them for each tensor separately in init_tensor
|
||||
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
||||
|
@ -902,13 +903,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc
|
|||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||
|
||||
size_t total_size = 0;
|
||||
|
@ -935,14 +936,14 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
|
|||
return total_size;
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
return false;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
|
||||
static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
|
@ -950,7 +951,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
|||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
|
@ -979,6 +980,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f
|
|||
|
||||
struct ggml_backend_buffer_type buft {
|
||||
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
|
||||
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
||||
};
|
||||
|
||||
|
@ -988,19 +990,19 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f
|
|||
|
||||
// host buffer type
|
||||
|
||||
GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_CUDA_NAME "_Host";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_CUDA_NAME "_Host";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||
}
|
||||
|
||||
|
@ -1022,7 +1024,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
|
|||
return ptr;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * ptr = ggml_cuda_host_malloc(size);
|
||||
|
||||
if (ptr == nullptr) {
|
||||
|
@ -1038,7 +1040,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_
|
|||
return buffer;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
||||
|
@ -1048,6 +1050,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
|
@ -2375,26 +2378,26 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||
|
||||
// backend
|
||||
|
||||
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
||||
static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
return cuda_ctx->name.c_str();
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
delete cuda_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
||||
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
|
@ -2403,7 +2406,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
|
|||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
|
@ -2412,7 +2415,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
|
|||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
||||
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||
|
||||
|
@ -2467,7 +2470,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
|||
return true;
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
||||
|
@ -2526,7 +2529,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
|
|||
return true;
|
||||
}
|
||||
|
||||
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
@ -2798,8 +2801,187 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
|
||||
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
if (ggml_backend_is_cuda(backend)) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
|
||||
} else {
|
||||
#if 0
|
||||
// untested
|
||||
auto wait_fn = [](void * user_data) {
|
||||
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
|
||||
ggml_backend_event_synchronize(event);
|
||||
};
|
||||
|
||||
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
|
||||
#endif
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_get_name,
|
||||
/* .free = */ ggml_backend_cuda_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||
/* .supports_op = */ NULL, // moved to device
|
||||
/* .supports_buft = */ NULL, // moved to device
|
||||
/* .offload_op = */ NULL, // moved to device
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cuda_guid() {
|
||||
static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
|
||||
return &guid;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
|
||||
}
|
||||
|
||||
int ggml_backend_cuda_get_device_count() {
|
||||
return ggml_cuda_info().device_count;
|
||||
}
|
||||
|
||||
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
||||
snprintf(description, description_size, "%s", prop.name);
|
||||
}
|
||||
|
||||
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
ggml_cuda_set_device(device);
|
||||
|
||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
}
|
||||
|
||||
bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
||||
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
|
||||
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
cudaGetLastError();
|
||||
|
||||
GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
||||
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaHostUnregister(buffer);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
cudaGetLastError();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// backend device
|
||||
|
||||
struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
|
||||
GGML_UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
bool events = false;
|
||||
#else
|
||||
bool events = true;
|
||||
#endif
|
||||
|
||||
props->caps = {
|
||||
/* async */ true,
|
||||
/* host_buffer */ host_buffer,
|
||||
/* events */ events,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
GGML_UNUSED(params);
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
return ggml_backend_cuda_init(ctx->device);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
return ggml_backend_cuda_buffer_type(ctx->device);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
|
||||
GGML_UNUSED(dev);
|
||||
return ggml_backend_cuda_host_buffer_type();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(ptr);
|
||||
GGML_UNUSED(size);
|
||||
GGML_UNUSED(max_tensor_size);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// TODO: move these functions here
|
||||
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
|
@ -3004,7 +3186,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||
if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc;
|
||||
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
|
||||
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||
}
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
|
@ -3014,115 +3196,170 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
if (ggml_backend_buft_is_cuda_split(buft)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ggml_backend_buft_is_cuda(buft)) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
return buft_ctx->device == cuda_ctx->device;
|
||||
return buft_ctx->device == dev_ctx->device;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
|
||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
|
||||
static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
return nullptr;
|
||||
#else
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
ggml_cuda_set_device(dev_ctx->device);
|
||||
|
||||
cudaEvent_t event;
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
|
||||
|
||||
return new ggml_backend_event {
|
||||
/* .backend = */ backend,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ event,
|
||||
};
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
|
||||
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
|
||||
static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
|
||||
GGML_UNUSED(dev);
|
||||
|
||||
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
|
||||
delete event;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
|
||||
|
||||
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
if (ggml_backend_is_cuda(event->backend)) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
|
||||
} else {
|
||||
#if 0
|
||||
// untested
|
||||
auto wait_fn = [](void * user_data) {
|
||||
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
|
||||
ggml_backend_event_synchronize(event);
|
||||
};
|
||||
|
||||
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
|
||||
#endif
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
|
||||
static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
|
||||
GGML_UNUSED(dev);
|
||||
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
||||
}
|
||||
|
||||
static ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_name,
|
||||
/* .free = */ ggml_backend_cuda_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_cuda_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_cuda_offload_op,
|
||||
/* .event_new = */ ggml_backend_cuda_event_new,
|
||||
/* .event_free = */ ggml_backend_cuda_event_free,
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
|
||||
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_device_get_name,
|
||||
/* .get_description = */ ggml_backend_cuda_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_cuda_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_cuda_device_get_type,
|
||||
/* .get_props = */ ggml_backend_cuda_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_cuda_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
|
||||
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_cuda_device_offload_op,
|
||||
/* .event_new = */ ggml_backend_cuda_device_event_new,
|
||||
/* .event_free = */ ggml_backend_cuda_device_event_free,
|
||||
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cuda_guid() {
|
||||
static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
|
||||
return &guid;
|
||||
// backend reg
|
||||
|
||||
struct ggml_backend_cuda_reg_context {
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
|
||||
GGML_UNUSED(reg);
|
||||
return GGML_CUDA_NAME;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
|
||||
return ctx->devices.size();
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||
ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
|
||||
GGML_ASSERT(index < ctx->devices.size());
|
||||
return ctx->devices[index];
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
GGML_UNUSED(reg);
|
||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
return (void *)ggml_backend_cuda_split_buffer_type;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
|
||||
return (void *)ggml_backend_cuda_register_host_buffer;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
|
||||
return (void *)ggml_backend_cuda_unregister_host_buffer;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
|
||||
GGML_UNUSED(reg);
|
||||
ggml_backend_cuda_log_set_callback(log_callback, user_data);
|
||||
}
|
||||
|
||||
static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
|
||||
/* .get_device_get = */ ggml_backend_cuda_reg_get_device,
|
||||
/* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
|
||||
/* .set_log_callback = */ ggml_backend_cuda_reg_set_log_callback,
|
||||
};
|
||||
|
||||
// backend registry
|
||||
ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
static ggml_backend_reg reg;
|
||||
static bool initialized = false;
|
||||
|
||||
{
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
||||
|
||||
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
||||
dev_ctx->device = i;
|
||||
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
|
||||
|
||||
ggml_cuda_set_device(i);
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .interface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx
|
||||
};
|
||||
ctx->devices.push_back(dev);
|
||||
}
|
||||
|
||||
reg = ggml_backend_reg {
|
||||
/* .interface = */ ggml_backend_cuda_reg_interface,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
return ®
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
|
||||
GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
|
||||
return nullptr;
|
||||
|
@ -3137,82 +3374,9 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
|||
ggml_backend_t cuda_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cuda_guid(),
|
||||
/* .interface = */ ggml_backend_cuda_interface,
|
||||
/* .context = */ ctx
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
|
||||
}
|
||||
|
||||
GGML_CALL int ggml_backend_cuda_get_device_count() {
|
||||
return ggml_cuda_info().device_count;
|
||||
}
|
||||
|
||||
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
||||
snprintf(description, description_size, "%s", prop.name);
|
||||
}
|
||||
|
||||
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
ggml_cuda_set_device(device);
|
||||
|
||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
}
|
||||
|
||||
GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
||||
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
|
||||
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
cudaGetLastError();
|
||||
|
||||
GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
||||
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaHostUnregister(buffer);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
cudaGetLastError();
|
||||
}
|
||||
}
|
||||
|
||||
// backend registry
|
||||
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
||||
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
||||
return cuda_backend;
|
||||
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
|
||||
|
||||
GGML_CALL int ggml_backend_cuda_reg_devices() {
|
||||
int device_count = ggml_backend_cuda_get_device_count();
|
||||
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
||||
for (int i = 0; i < device_count; i++) {
|
||||
char name[128];
|
||||
snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
|
||||
ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
|
||||
}
|
||||
return device_count;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue