opencl : add ggml-backend buffer type

This commit is contained in:
slaren 2024-01-09 03:14:16 +01:00
parent 4ed5f621be
commit fa7620116e
11 changed files with 419 additions and 132 deletions

View file

@ -229,6 +229,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
} else { } else {
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
ggml_backend_buffer_reset(alloc->buffer);
} }
} }

View file

@ -19,7 +19,7 @@ extern "C" {
const char * (*get_name) (ggml_backend_buffer_type_t buft); const char * (*get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory // check if tensor data is in host memory
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
@ -37,15 +37,15 @@ extern "C" {
struct ggml_backend_buffer_i { struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer); const char * (*get_name) (ggml_backend_buffer_t buffer);
void (*free_buffer) (ggml_backend_buffer_t buffer); void (*free_buffer) (ggml_backend_buffer_t buffer);
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
void * (*get_base) (ggml_backend_buffer_t buffer); void * (*get_base) (ggml_backend_buffer_t buffer);
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
}; };
struct ggml_backend_buffer { struct ggml_backend_buffer {
@ -82,13 +82,13 @@ extern "C" {
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) asynchroneous tensor copy // (optional) asynchroneous tensor copy
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); void (*cpy_tensor_from_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); void (*cpy_tensor_to_async) (ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
void (*synchronize)(ggml_backend_t backend); void (*synchronize)(ggml_backend_t backend);
// compute graph with a plan // compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

View file

@ -103,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
} }
size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) { size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer)); return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
} }
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor); return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
} }
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@ -115,17 +115,23 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
} }
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer)); return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
} }
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
buffer->usage = usage; buffer->usage = usage;
} }
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
return buffer->buft; return buffer->buft;
} }
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
if (buffer->iface.reset) {
buffer->iface.reset(buffer);
}
}
// backend // backend
const char * ggml_backend_name(ggml_backend_t backend) { const char * ggml_backend_name(ggml_backend_t backend) {
@ -431,13 +437,13 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
@ -457,6 +463,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
/* .clear = */ ggml_backend_cpu_buffer_clear, /* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,
}; };
// for buffers from ptr, free is not called // for buffers from ptr, free is not called
@ -470,6 +477,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
/* .clear = */ ggml_backend_cpu_buffer_clear, /* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,
}; };
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@ -554,7 +562,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
return NULL; return NULL;
} }
// FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft; buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name; buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
@ -610,7 +617,7 @@ struct ggml_backend_plan_cpu {
struct ggml_cgraph cgraph; struct ggml_cgraph cgraph;
}; };
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@ -1371,14 +1378,10 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
sched_reset(sched); sched_reset(sched);
} }
void ggml_backend_sched_graph_split(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
sched_split_graph(sched, graph); sched_split_graph(sched, graph);
}
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
ggml_backend_sched_graph_split(sched, graph);
sched_alloc_splits(sched); sched_alloc_splits(sched);
sched_compute_splits(sched); sched_compute_splits(sched);
sched_reset(sched); sched_reset(sched);
@ -1410,7 +1413,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->buffer == NULL);
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL); GGML_ASSERT(tensor->view_src->data != NULL);

View file

@ -40,7 +40,8 @@ extern "C" {
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type (ggml_backend_buffer_t buffer); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// //
@ -166,11 +167,6 @@ extern "C" {
ggml_backend_sched_t sched, ggml_backend_sched_t sched,
struct ggml_cgraph * graph); struct ggml_cgraph * graph);
// Split without computing - only useful to find the number of splits
GGML_API void ggml_backend_sched_graph_split(
ggml_backend_sched_t sched,
struct ggml_cgraph * graph);
// //
// Utils // Utils
// //

View file

@ -9487,7 +9487,7 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaDeviceSynchronize());
} }
static struct ggml_backend_buffer_i ggml_cuda_backend_buffer_interface = { static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .get_name = */ ggml_backend_cuda_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .get_base = */ ggml_backend_cuda_buffer_get_base,
@ -9497,6 +9497,7 @@ static struct ggml_backend_buffer_i ggml_cuda_backend_buffer_interface = {
/* .cpy_tensor_from = */ NULL, /* .cpy_tensor_from = */ NULL,
/* .cpy_tensor_to = */ NULL, /* .cpy_tensor_to = */ NULL,
/* .clear = */ ggml_backend_cuda_buffer_clear, /* .clear = */ ggml_backend_cuda_buffer_clear,
/* .reset = */ NULL,
}; };
// cuda buffer type // cuda buffer type
@ -9528,7 +9529,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr); ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr);
return ggml_backend_buffer_init(buft, ggml_cuda_backend_buffer_interface, ctx, size); return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
} }
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@ -9537,7 +9538,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty
UNUSED(buft); UNUSED(buft);
} }
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
int64_t row_low = 0; int64_t row_low = 0;
int64_t row_high = ggml_nrows(tensor); int64_t row_high = ggml_nrows(tensor);
int64_t nrows_split = row_high - row_low; int64_t nrows_split = row_high - row_low;
@ -9574,7 +9575,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
/* .is_host = */ nullptr, /* .is_host = */ NULL,
}; };
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@ -9583,7 +9584,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
return nullptr; return nullptr;
} }
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
static bool ggml_backend_cuda_buffer_type_initialized = false; static bool ggml_backend_cuda_buffer_type_initialized = false;
@ -9759,7 +9760,7 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
UNUSED(value); UNUSED(value);
} }
static struct ggml_backend_buffer_i ggml_cuda_backend_split_buffer_interface = { static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name, /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base, /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
@ -9769,6 +9770,7 @@ static struct ggml_backend_buffer_i ggml_cuda_backend_split_buffer_interface = {
/* .cpy_tensor_from = */ NULL, /* .cpy_tensor_from = */ NULL,
/* .cpy_tensor_to = */ NULL, /* .cpy_tensor_to = */ NULL,
/* .clear = */ ggml_backend_cuda_split_buffer_clear, /* .clear = */ ggml_backend_cuda_split_buffer_clear,
/* .reset = */ NULL,
}; };
// cuda split buffer type // cuda split buffer type
@ -9786,7 +9788,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
return ggml_backend_buffer_init(buft, ggml_cuda_backend_split_buffer_interface, ctx, size); return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
} }
static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@ -9795,7 +9797,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
UNUSED(buft); UNUSED(buft);
} }
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
size_t total_size = 0; size_t total_size = 0;
@ -9903,7 +9905,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
} }
// FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft; buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
@ -9975,29 +9976,6 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
UNUSED(backend); UNUSED(backend);
} }
static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
GGML_ASSERT(!"not implemented");
return nullptr;
UNUSED(backend);
UNUSED(cgraph);
}
static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(!"not implemented");
UNUSED(backend);
UNUSED(plan);
}
static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(!"not implemented");
UNUSED(backend);
UNUSED(plan);
}
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
@ -10149,7 +10127,7 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
UNUSED(backend); UNUSED(backend);
} }
static ggml_backend_i cuda_backend_i = { static ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_name, /* .get_name = */ ggml_backend_cuda_name,
/* .free = */ ggml_backend_cuda_free, /* .free = */ ggml_backend_cuda_free,
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
@ -10158,9 +10136,9 @@ static ggml_backend_i cuda_backend_i = {
/* .cpy_tensor_from_async = */ NULL, /* .cpy_tensor_from_async = */ NULL,
/* .cpy_tensor_to_async = */ NULL, /* .cpy_tensor_to_async = */ NULL,
/* .synchronize = */ ggml_backend_cuda_synchronize, /* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, /* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, /* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op, /* .supports_op = */ ggml_backend_cuda_supports_op,
}; };
@ -10182,7 +10160,7 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
}; };
ggml_backend_t cuda_backend = new ggml_backend { ggml_backend_t cuda_backend = new ggml_backend {
/* .interface = */ cuda_backend_i, /* .interface = */ ggml_backend_cuda_interface,
/* .context = */ ctx /* .context = */ ctx
}; };

View file

@ -2481,13 +2481,13 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c
UNUSED(buffer); UNUSED(buffer);
} }
static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
UNUSED(buffer); UNUSED(buffer);
} }
static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
UNUSED(buffer); UNUSED(buffer);
@ -2509,6 +2509,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
/* .clear = */ ggml_backend_metal_buffer_clear, /* .clear = */ ggml_backend_metal_buffer_clear,
/* .reset = */ NULL,
}; };
// default buffer type // default buffer type
@ -2715,7 +2716,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct
UNUSED(backend); UNUSED(backend);
} }
static struct ggml_backend_i metal_backend_i = { static struct ggml_backend_i ggml_backend_metal_i = {
/* .get_name = */ ggml_backend_metal_name, /* .get_name = */ ggml_backend_metal_name,
/* .free = */ ggml_backend_metal_free, /* .free = */ ggml_backend_metal_free,
/* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type, /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
@ -2741,7 +2742,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
*metal_backend = (struct ggml_backend) { *metal_backend = (struct ggml_backend) {
/* .interface = */ metal_backend_i, /* .interface = */ ggml_backend_metal_i,
/* .context = */ ctx, /* .context = */ ctx,
}; };

View file

@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-opencl.h" #include "ggml-opencl.h"
#include "ggml-backend-impl.h"
#include <array> #include <array>
#include <atomic> #include <atomic>
@ -10,7 +11,7 @@
#include <sstream> #include <sstream>
#include <vector> #include <vector>
#define CL_TARGET_OPENCL_VERSION 110 #define CL_TARGET_OPENCL_VERSION 120
#include <clblast.h> #include <clblast.h>
#if defined(_MSC_VER) #if defined(_MSC_VER)
@ -929,6 +930,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
} }
void ggml_cl_init(void) { void ggml_cl_init(void) {
static bool initialized = false;
if (initialized) {
return;
}
cl_int err; cl_int err;
struct cl_device; struct cl_device;
@ -1483,8 +1489,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
} else { } else {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
} }
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
size_t x_offset = 0; size_t x_offset = 0;
@ -1501,7 +1507,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
// copy src1 to device // copy src1 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); if (src1->backend == GGML_BACKEND_CPU) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
}
CL_CHECK(clFinish(queue)); CL_CHECK(clFinish(queue));
@ -1522,8 +1530,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
} }
// copy dst to host // copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (dst->backend == GGML_BACKEND_CPU) {
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
}
} }
} }
} }
@ -1532,8 +1542,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend != GGML_BACKEND_GPU) { if (src0->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_X, x_size); ggml_cl_pool_free(d_X, x_size);
} }
ggml_cl_pool_free(d_Y, y_size); if (src1->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_D, d_size); ggml_cl_pool_free(d_Y, y_size);
}
if (dst->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_D, d_size);
}
} }
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
@ -1598,6 +1612,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
} }
// FIXME: convert on device
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
// convert src1 to fp16 // convert src1 to fp16
// TODO: use multiple threads // TODO: use multiple threads
@ -1643,11 +1659,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
} }
// copy dst to host, then convert to float // copy dst to host, then convert to float
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); if (dst->backend == GGML_BACKEND_CPU) {
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_fp16_to_fp32_row(tmp, d, d_ne);
ggml_fp16_to_fp32_row(tmp, d, d_ne); } else {
// FIXME: convert dst to fp32 on device
}
} }
} }
} }
@ -1801,7 +1819,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
} }
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
const int64_t ne10 = src1->ne[0]; const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0]; const int64_t ne0 = dst->ne[0];
@ -1895,3 +1913,292 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
tensor->extra = dst; tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
} }
// ggml-backend
// buffer
struct ggml_backend_opencl_buffer_context {
~ggml_backend_opencl_buffer_context() {
if (buffer) {
clReleaseMemObject(buffer);
}
for (auto * sub_buffer : sub_buffers) {
clReleaseMemObject(sub_buffer);
}
}
cl_mem buffer;
std::vector<cl_mem> sub_buffers;
};
static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
return "OpenCL";
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
}
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
return cl_ptr_base;
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
if (tensor->view_src != NULL && tensor->view_offs == 0) {
tensor->extra = tensor->view_src->extra;
} else {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
cl_int err;
cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
ctx->sub_buffers.push_back(sub_buffer);
tensor->extra = sub_buffer;
}
tensor->backend = GGML_BACKEND_GPU;
}
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
cl_mem tensor_buffer = (cl_mem) tensor->extra;
CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
cl_mem tensor_buffer = (cl_mem) tensor->extra;
CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
}
static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
for (auto * sub_buffer : ctx->sub_buffers) {
clReleaseMemObject(sub_buffer);
}
ctx->sub_buffers.clear();
}
static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_get_name,
/* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
/* .get_base = */ ggml_backend_opencl_buffer_get_base,
/* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
/* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
/* .cpy_tensor_from = */ NULL,
/* .cpy_tensor_to = */ NULL,
/* .clear = */ ggml_backend_opencl_buffer_clear,
/* .reset = */ ggml_backend_opencl_buffer_reset,
};
// buffer type
static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
return "OpenCL";
GGML_UNUSED(buffer_type);
}
static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
ggml_cl_init();
cl_int err;
cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
if (err != CL_SUCCESS) {
fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
return nullptr;
}
ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
}
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
// FIXME: not thread safe, device may not be initialized yet
static cl_uint alignment = -1;
if (alignment == (cl_uint)-1) {
ggml_cl_init();
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
}
return alignment;
GGML_UNUSED(buffer_type);
}
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
return ggml_backend_is_cpu(backend);
GGML_UNUSED(buffer_type);
}
static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,
};
ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
static ggml_backend_buffer_type buffer_type = {
/* .iface = */ ggml_backend_opencl_buffer_type_interface,
/* .context = */ nullptr,
};
return &buffer_type;
}
#if 0
// host buffer type
static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
return "CL_Host";
GGML_UNUSED(buft);
}
static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
return "CL_Host";
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_cl_host_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * ptr = ggml_cl_host_malloc(size);
if (ptr == nullptr) {
// fallback to cpu buffer
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
/* .iface = */ {
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .context = */ nullptr,
};
return &ggml_backend_opencl_buffer_type_host;
}
// backend
static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
return "OpenCL";
GGML_UNUSED(backend);
}
static void ggml_backend_opencl_free(ggml_backend_t backend) {
GGML_UNUSED(backend);
}
static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_opencl_buffer_type();
GGML_UNUSED(backend);
}
static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * node = graph->nodes[i];
switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
break;
case GGML_OP_MUL:
ggml_cl_mul(node->src[0], node->src[1], node);
break;
default:
GGML_ASSERT(false);
}
}
return true;
GGML_UNUSED(backend);
}
static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_MUL_MAT:
return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
case GGML_OP_MUL:
// return ggml_can_repeat_rows(op->src[1], op->src[0]);
return true;
default:
return false;
}
GGML_UNUSED(backend);
}
static ggml_backend_i opencl_backend_i = {
/* .get_name = */ ggml_backend_opencl_name,
/* .free = */ ggml_backend_opencl_free,
/* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_from_async = */ NULL,
/* .cpy_tensor_to_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_opencl_graph_compute,
/* .supports_op = */ ggml_backend_opencl_supports_op,
};
ggml_backend_t ggml_backend_opencl_init() {
ggml_backend_t backend = new ggml_backend {
/* .interface = */ opencl_backend_i,
/* .context = */ nullptr
};
return backend;
}
bool ggml_backend_is_opencl(ggml_backend_t backend) {
return backend && backend->iface.get_name == ggml_backend_opencl_name;
}
#endif

View file

@ -1,6 +1,7 @@
#pragma once #pragma once
#include "ggml.h" #include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -9,17 +10,26 @@ extern "C" {
GGML_API void ggml_cl_init(void); GGML_API void ggml_cl_init(void);
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
GGML_API void * ggml_cl_host_malloc(size_t size); // GGML_API void * ggml_cl_host_malloc(size_t size);
GGML_API void ggml_cl_host_free(void * ptr); // GGML_API void ggml_cl_host_free(void * ptr);
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
// backend API
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

2
ggml.c
View file

@ -16601,7 +16601,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
return GGML_EXIT_SUCCESS; return GGML_EXIT_SUCCESS;
} }
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
if (n_threads <= 0) { if (n_threads <= 0) {
n_threads = GGML_DEFAULT_N_THREADS; n_threads = GGML_DEFAULT_N_THREADS;
} }

4
ggml.h
View file

@ -1852,8 +1852,8 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute() // ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data // when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context // same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data

View file

@ -1204,8 +1204,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
ggml_backend_buffer_type_t buft = nullptr; ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_CUBLAS) #if defined(GGML_USE_CUBLAS)
// in some cases such as the KV cache, there is no benefit to using a host buffer, // host buffers should only be used when data is expected to be copied to/from the GPU
// since the data is never copied to the GPU
if (host_buffer) { if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type(); buft = ggml_backend_cuda_host_buffer_type();
} }
@ -1228,6 +1227,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
buft = ggml_backend_metal_buffer_type(); buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUBLAS) #elif defined(GGML_USE_CUBLAS)
buft = ggml_backend_cuda_buffer_type(gpu); buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#endif #endif
if (buft == nullptr) { if (buft == nullptr) {
@ -1695,6 +1696,10 @@ static bool llama_kv_cache_init(
cache.cells.clear(); cache.cells.clear();
cache.cells.resize(n_ctx); cache.cells.resize(n_ctx);
#ifdef GGML_USE_CLBLAST
offload = false;
#endif
// count used buffer types // count used buffer types
std::map<ggml_backend_buffer_type_t, int> buft_layer_count; std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
if (offload) { if (offload) {
@ -1702,9 +1707,10 @@ static bool llama_kv_cache_init(
buft_layer_count[model.buft_layer[i].buft]++; buft_layer_count[model.buft_layer[i].buft]++;
} }
} else { } else {
buft_layer_count[llama_default_buffer_type_cpu(false)] = n_layer; buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
} }
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) { for (auto & it : buft_layer_count) {
int n_layers = it.second; int n_layers = it.second;
@ -2413,26 +2419,12 @@ struct llama_model_loader {
} }
void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
/*
// prefetch only CPU tensors
if (use_mmap) {
size_t size_pref = 0; // prefetch
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
if (cur->backend == buffer_type_cpu) {
size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
size_pref = std::max(size_pref, tensor_end);
}
}
mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
}
*/
// prefetch the whole file - all the data is needed anyway // prefetch the whole file - all the data is needed anyway
if (use_mmap) { if (use_mmap) {
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
} }
// compute the total size of all tensors for progress reporting
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
size_data += ggml_nbytes(cur); size_data += ggml_nbytes(cur);
@ -3173,16 +3165,15 @@ static bool llm_load_tensors(
model.main_gpu = main_gpu; model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers; model.n_gpu_layers = n_gpu_layers;
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
const int64_t n_layer = hparams.n_layer; const int64_t n_layer = hparams.n_layer;
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0); const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
// there is very little benefit to offloading the input layer, so always keep it on the CPU
// there is very little benefit to offloading the input layers, so always keep it on the CPU
model.buft_input = llama_default_buffer_type_cpu(true); model.buft_input = llama_default_buffer_type_cpu(true);
model.buft_layer.resize(n_layer); model.buft_layer.resize(n_layer);
// cpu layers
// assign cpu layers
for (int64_t i = 0; i < i_gpu_start; ++i) { for (int64_t i = 0; i < i_gpu_start; ++i) {
model.buft_layer[i] = llama_default_buffer_type_cpu(true); model.buft_layer[i] = llama_default_buffer_type_cpu(true);
} }
@ -3191,11 +3182,8 @@ static bool llm_load_tensors(
if (split_mode == LLAMA_SPLIT_LAYER) { if (split_mode == LLAMA_SPLIT_LAYER) {
// calculate the split points // calculate the split points
int device_count = ggml_backend_cuda_get_device_count(); int device_count = ggml_backend_cuda_get_device_count();
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
float splits[GGML_CUDA_MAX_DEVICES]; float splits[GGML_CUDA_MAX_DEVICES];
if (tensor_split != nullptr) {
std::copy(tensor_split, tensor_split + device_count, splits);
}
bool all_zero = tensor_split == nullptr || std::all_of(splits, splits + device_count, [](float x) { return x == 0.0f; });
if (all_zero) { if (all_zero) {
// default split, by free memory // default split, by free memory
for (int i = 0; i < device_count; ++i) { for (int i = 0; i < device_count; ++i) {
@ -3204,7 +3192,11 @@ static bool llm_load_tensors(
ggml_backend_cuda_get_device_memory(i, &total, &free); ggml_backend_cuda_get_device_memory(i, &total, &free);
splits[i] = free; splits[i] = free;
} }
} else {
std::copy(tensor_split, tensor_split + device_count, splits);
} }
// sum and normalize the splits to get the split points
float split_sum = 0.0f; float split_sum = 0.0f;
for (int i = 0; i < device_count; ++i) { for (int i = 0; i < device_count; ++i) {
split_sum += splits[i]; split_sum += splits[i];
@ -3214,15 +3206,15 @@ static bool llm_load_tensors(
splits[i] /= split_sum; splits[i] /= split_sum;
} }
// assign GPU layers according to the splits to the devices // assign the repeating layers to the devices according to the splits
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
for (int64_t i = i_gpu_start; i < n_layer; ++i) { for (int64_t i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
} }
// output layer // assign the output layer
if (n_gpu_layers > n_layer) { if (n_gpu_layers > n_layer) {
int layer_gpu = std::upper_bound(splits, splits + device_count, float(n_layer)/act_gpu_layers) - splits; int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
model.buft_output = llama_default_buffer_type_offload(layer_gpu); model.buft_output = llama_default_buffer_type_offload(layer_gpu);
} else { } else {
model.buft_output = llama_default_buffer_type_cpu(true); model.buft_output = llama_default_buffer_type_cpu(true);
@ -3234,16 +3226,17 @@ static bool llm_load_tensors(
if (split_mode == LLAMA_SPLIT_ROW) { if (split_mode == LLAMA_SPLIT_ROW) {
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
} else { } else {
// LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
split_buft = llama_default_buffer_type_offload(main_gpu); split_buft = llama_default_buffer_type_offload(main_gpu);
} }
// repeating layers // assign the repeating layers
for (int64_t i = i_gpu_start; i < n_layer; ++i) { for (int64_t i = i_gpu_start; i < n_layer; ++i) {
model.buft_layer[i] = { model.buft_layer[i] = {
split_buft, split_buft,
llama_default_buffer_type_offload(main_gpu) llama_default_buffer_type_offload(main_gpu)
}; };
} }
// output layer // assign the output layer
if (n_gpu_layers > n_layer) { if (n_gpu_layers > n_layer) {
model.buft_output = { model.buft_output = {
split_buft, split_buft,
@ -3266,6 +3259,7 @@ static bool llm_load_tensors(
} }
// create one context per buffer type // create one context per buffer type
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) { for (auto & it : buft_layer_count) {
struct ggml_init_params params = { struct ggml_init_params params = {
@ -3289,14 +3283,11 @@ static bool llm_load_tensors(
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_embd_gqa = n_embd_v_gqa;
const int64_t n_layer = hparams.n_layer;
const int64_t n_vocab = hparams.n_vocab; const int64_t n_vocab = hparams.n_vocab;
const int64_t n_ff = hparams.n_ff; const int64_t n_ff = hparams.n_ff;
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
// FIXME: for metal, it may be better to put the input on the GPU context - however it may make no difference in practice,
// and it would increase the metal buffer size
ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix); ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@ -3617,8 +3608,8 @@ static bool llm_load_tensors(
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}); layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
@ -3736,8 +3727,7 @@ static bool llm_load_tensors(
ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr); ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
// create backend buffers // create the backend buffers
std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs; std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
for (auto & it : ctx_map) { for (auto & it : ctx_map) {
@ -3745,8 +3735,9 @@ static bool llm_load_tensors(
ggml_context * ctx = it.second; ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = nullptr; ggml_backend_buffer_t buf = nullptr;
// only the region containing the tensors in the model is mapped to the backend buffer // only the mmap region containing the tensors in the model is mapped to the backend buffer
// this is important for metal: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
size_t first, last; size_t first, last;
ml.get_mapping_range(&first, &last, ctx); ml.get_mapping_range(&first, &last, ctx);
@ -3771,7 +3762,7 @@ static bool llm_load_tensors(
throw std::runtime_error("failed to allocate buffer"); throw std::runtime_error("failed to allocate buffer");
} }
// indicate that this buffer contains weights // indicate that this buffer contains weights
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are always scheduled to the backend that contains the weight // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
model.bufs.push_back(buf); model.bufs.push_back(buf);
ctx_bufs.emplace_back(ctx, buf); ctx_bufs.emplace_back(ctx, buf);
@ -3803,7 +3794,7 @@ static bool llm_load_tensors(
} }
} }
// load data // load tensor data
for (auto & it : ctx_bufs) { for (auto & it : ctx_bufs) {
ggml_context * ctx = it.first; ggml_context * ctx = it.first;
ggml_backend_buffer_t buf = it.second; ggml_backend_buffer_t buf = it.second;
@ -3849,7 +3840,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
return -2; return -2;
} }
} catch (const std::exception & err) { } catch (const std::exception & err) {
LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
return -1; return -1;
} }