opencl : add ggml-backend buffer type
This commit is contained in:
parent
4ed5f621be
commit
fa7620116e
11 changed files with 419 additions and 132 deletions
|
@ -229,6 +229,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
|
|||
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
||||
} else {
|
||||
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
||||
ggml_backend_buffer_reset(alloc->buffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ extern "C" {
|
|||
const char * (*get_name) (ggml_backend_buffer_type_t buft);
|
||||
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||
// check if tensor data is in host memory
|
||||
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
||||
|
@ -37,15 +37,15 @@ extern "C" {
|
|||
struct ggml_backend_buffer_i {
|
||||
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
|
||||
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer {
|
||||
|
@ -82,13 +82,13 @@ extern "C" {
|
|||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
// (optional) asynchroneous tensor copy
|
||||
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_from_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_to_async) (ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
void (*synchronize)(ggml_backend_t backend);
|
||||
|
||||
// compute graph with a plan
|
||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
|
|
|
@ -103,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
|
|||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
|
||||
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
|
||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
|
@ -115,17 +115,23 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|||
}
|
||||
|
||||
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
|
||||
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||
buffer->usage = usage;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
||||
return buffer->buft;
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||
if (buffer->iface.reset) {
|
||||
buffer->iface.reset(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
// backend
|
||||
|
||||
const char * ggml_backend_name(ggml_backend_t backend) {
|
||||
|
@ -431,13 +437,13 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con
|
|||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
|
@ -457,6 +463,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
|||
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
||||
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
||||
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
// for buffers from ptr, free is not called
|
||||
|
@ -470,6 +477,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|||
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
||||
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
||||
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
||||
|
@ -554,7 +562,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
||||
|
@ -610,7 +617,7 @@ struct ggml_backend_plan_cpu {
|
|||
struct ggml_cgraph cgraph;
|
||||
};
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||
|
@ -1371,14 +1378,10 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
|
|||
sched_reset(sched);
|
||||
}
|
||||
|
||||
void ggml_backend_sched_graph_split(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||
|
||||
sched_split_graph(sched, graph);
|
||||
}
|
||||
|
||||
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
ggml_backend_sched_graph_split(sched, graph);
|
||||
sched_alloc_splits(sched);
|
||||
sched_compute_splits(sched);
|
||||
sched_reset(sched);
|
||||
|
@ -1410,7 +1413,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
|
|||
|
||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->buffer == NULL);
|
||||
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
|
||||
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
|
||||
GGML_ASSERT(tensor->view_src != NULL);
|
||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||
GGML_ASSERT(tensor->view_src->data != NULL);
|
||||
|
|
|
@ -40,7 +40,8 @@ extern "C" {
|
|||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type (ggml_backend_buffer_t buffer);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
|
||||
|
||||
|
||||
//
|
||||
|
@ -166,11 +167,6 @@ extern "C" {
|
|||
ggml_backend_sched_t sched,
|
||||
struct ggml_cgraph * graph);
|
||||
|
||||
// Split without computing - only useful to find the number of splits
|
||||
GGML_API void ggml_backend_sched_graph_split(
|
||||
ggml_backend_sched_t sched,
|
||||
struct ggml_cgraph * graph);
|
||||
|
||||
//
|
||||
// Utils
|
||||
//
|
||||
|
|
52
ggml-cuda.cu
52
ggml-cuda.cu
|
@ -9487,7 +9487,7 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i ggml_cuda_backend_buffer_interface = {
|
||||
static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
||||
|
@ -9497,6 +9497,7 @@ static struct ggml_backend_buffer_i ggml_cuda_backend_buffer_interface = {
|
|||
/* .cpy_tensor_from = */ NULL,
|
||||
/* .cpy_tensor_to = */ NULL,
|
||||
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
// cuda buffer type
|
||||
|
@ -9528,7 +9529,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
|||
|
||||
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr);
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_cuda_backend_buffer_interface, ctx, size);
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
|
@ -9537,7 +9538,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty
|
|||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
int64_t row_low = 0;
|
||||
int64_t row_high = ggml_nrows(tensor);
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
|
@ -9574,7 +9575,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
|||
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
||||
/* .is_host = */ nullptr,
|
||||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||
|
@ -9583,7 +9584,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
||||
static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
||||
|
||||
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
||||
|
||||
|
@ -9759,7 +9760,7 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
|
|||
UNUSED(value);
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i ggml_cuda_backend_split_buffer_interface = {
|
||||
static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
||||
|
@ -9769,6 +9770,7 @@ static struct ggml_backend_buffer_i ggml_cuda_backend_split_buffer_interface = {
|
|||
/* .cpy_tensor_from = */ NULL,
|
||||
/* .cpy_tensor_to = */ NULL,
|
||||
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
// cuda split buffer type
|
||||
|
@ -9786,7 +9788,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
|
|||
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
|
||||
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_cuda_backend_split_buffer_interface, ctx, size);
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
|
@ -9795,7 +9797,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
|
|||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||
|
||||
size_t total_size = 0;
|
||||
|
@ -9903,7 +9905,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
|
||||
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
|
||||
|
@ -9975,29 +9976,6 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
GGML_ASSERT(!"not implemented");
|
||||
|
||||
return nullptr;
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(cgraph);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
GGML_ASSERT(!"not implemented");
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(plan);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
GGML_ASSERT(!"not implemented");
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(plan);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||
|
||||
|
@ -10149,7 +10127,7 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_i cuda_backend_i = {
|
||||
static ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_name,
|
||||
/* .free = */ ggml_backend_cuda_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
||||
|
@ -10158,9 +10136,9 @@ static ggml_backend_i cuda_backend_i = {
|
|||
/* .cpy_tensor_from_async = */ NULL,
|
||||
/* .cpy_tensor_to_async = */ NULL,
|
||||
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
||||
/* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
|
||||
/* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
|
||||
/* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
||||
};
|
||||
|
@ -10182,7 +10160,7 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
|||
};
|
||||
|
||||
ggml_backend_t cuda_backend = new ggml_backend {
|
||||
/* .interface = */ cuda_backend_i,
|
||||
/* .interface = */ ggml_backend_cuda_interface,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
|
||||
|
|
|
@ -2481,13 +2481,13 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c
|
|||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(buffer);
|
||||
|
@ -2509,6 +2509,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
|||
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
|
||||
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
||||
/* .clear = */ ggml_backend_metal_buffer_clear,
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
// default buffer type
|
||||
|
@ -2715,7 +2716,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static struct ggml_backend_i metal_backend_i = {
|
||||
static struct ggml_backend_i ggml_backend_metal_i = {
|
||||
/* .get_name = */ ggml_backend_metal_name,
|
||||
/* .free = */ ggml_backend_metal_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
|
||||
|
@ -2741,7 +2742,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
|
|||
ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
|
||||
|
||||
*metal_backend = (struct ggml_backend) {
|
||||
/* .interface = */ metal_backend_i,
|
||||
/* .interface = */ ggml_backend_metal_i,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
|
|
335
ggml-opencl.cpp
335
ggml-opencl.cpp
|
@ -1,5 +1,6 @@
|
|||
#include "ggml.h"
|
||||
#include "ggml-opencl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
|
@ -10,7 +11,7 @@
|
|||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 110
|
||||
#define CL_TARGET_OPENCL_VERSION 120
|
||||
#include <clblast.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
@ -929,6 +930,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
|||
}
|
||||
|
||||
void ggml_cl_init(void) {
|
||||
static bool initialized = false;
|
||||
if (initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
cl_int err;
|
||||
|
||||
struct cl_device;
|
||||
|
@ -1483,8 +1489,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
} else {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
}
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
|
||||
size_t x_offset = 0;
|
||||
|
||||
|
@ -1501,7 +1507,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
if (src1->backend == GGML_BACKEND_CPU) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
}
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
|
@ -1522,8 +1530,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1532,8 +1542,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
}
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
if (src1->backend != GGML_BACKEND_GPU) {
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
}
|
||||
if (dst->backend != GGML_BACKEND_GPU) {
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
|
||||
|
@ -1598,6 +1612,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
}
|
||||
|
||||
// FIXME: convert on device
|
||||
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
|
@ -1643,11 +1659,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
} else {
|
||||
// FIXME: convert dst to fp32 on device
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1801,7 +1819,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
}
|
||||
|
||||
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
|
@ -1895,3 +1913,292 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
|||
tensor->extra = dst;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
}
|
||||
|
||||
// ggml-backend
|
||||
|
||||
// buffer
|
||||
|
||||
struct ggml_backend_opencl_buffer_context {
|
||||
~ggml_backend_opencl_buffer_context() {
|
||||
if (buffer) {
|
||||
clReleaseMemObject(buffer);
|
||||
}
|
||||
for (auto * sub_buffer : sub_buffers) {
|
||||
clReleaseMemObject(sub_buffer);
|
||||
}
|
||||
}
|
||||
|
||||
cl_mem buffer;
|
||||
std::vector<cl_mem> sub_buffers;
|
||||
};
|
||||
|
||||
static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
|
||||
|
||||
static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return "OpenCL";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return cl_ptr_base;
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||
tensor->extra = tensor->view_src->extra;
|
||||
} else {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
|
||||
cl_int err;
|
||||
cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
ctx->sub_buffers.push_back(sub_buffer);
|
||||
tensor->extra = sub_buffer;
|
||||
}
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
cl_mem tensor_buffer = (cl_mem) tensor->extra;
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
cl_mem tensor_buffer = (cl_mem) tensor->extra;
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(queue));
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
for (auto * sub_buffer : ctx->sub_buffers) {
|
||||
clReleaseMemObject(sub_buffer);
|
||||
}
|
||||
ctx->sub_buffers.clear();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_opencl_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_opencl_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
|
||||
/* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
|
||||
/* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
|
||||
/* .cpy_tensor_from = */ NULL,
|
||||
/* .cpy_tensor_to = */ NULL,
|
||||
/* .clear = */ ggml_backend_opencl_buffer_clear,
|
||||
/* .reset = */ ggml_backend_opencl_buffer_reset,
|
||||
};
|
||||
|
||||
// buffer type
|
||||
|
||||
static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
|
||||
return "OpenCL";
|
||||
|
||||
GGML_UNUSED(buffer_type);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
|
||||
ggml_cl_init();
|
||||
|
||||
cl_int err;
|
||||
cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
|
||||
|
||||
return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
|
||||
// FIXME: not thread safe, device may not be initialized yet
|
||||
static cl_uint alignment = -1;
|
||||
if (alignment == (cl_uint)-1) {
|
||||
ggml_cl_init();
|
||||
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
|
||||
}
|
||||
return alignment;
|
||||
|
||||
GGML_UNUSED(buffer_type);
|
||||
}
|
||||
|
||||
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
|
||||
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
|
||||
return ggml_backend_is_cpu(backend);
|
||||
|
||||
GGML_UNUSED(buffer_type);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
|
||||
/* .get_alloc_size = */ NULL,
|
||||
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
|
||||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
|
||||
static ggml_backend_buffer_type buffer_type = {
|
||||
/* .iface = */ ggml_backend_opencl_buffer_type_interface,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &buffer_type;
|
||||
}
|
||||
|
||||
#if 0
|
||||
// host buffer type
|
||||
|
||||
static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CL_Host";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return "CL_Host";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_cl_host_free(buffer->context);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * ptr = ggml_cl_host_malloc(size);
|
||||
|
||||
if (ptr == nullptr) {
|
||||
// fallback to cpu buffer
|
||||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &ggml_backend_opencl_buffer_type_host;
|
||||
}
|
||||
|
||||
// backend
|
||||
|
||||
static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
|
||||
return "OpenCL";
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_free(ggml_backend_t backend) {
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return ggml_backend_opencl_buffer_type();
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
|
||||
for (int i = 0; i < graph->n_nodes; ++i) {
|
||||
ggml_tensor * node = graph->nodes[i];
|
||||
switch (node->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
|
||||
break;
|
||||
case GGML_OP_MUL:
|
||||
ggml_cl_mul(node->src[0], node->src[1], node);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
|
||||
case GGML_OP_MUL:
|
||||
// return ggml_can_repeat_rows(op->src[1], op->src[0]);
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_i opencl_backend_i = {
|
||||
/* .get_name = */ ggml_backend_opencl_name,
|
||||
/* .free = */ ggml_backend_opencl_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_from_async = */ NULL,
|
||||
/* .cpy_tensor_to_async = */ NULL,
|
||||
/* .synchronize = */ NULL,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_opencl_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_opencl_supports_op,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_opencl_init() {
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .interface = */ opencl_backend_i,
|
||||
/* .context = */ nullptr
|
||||
};
|
||||
|
||||
return backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_opencl(ggml_backend_t backend) {
|
||||
return backend && backend->iface.get_name == ggml_backend_opencl_name;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -9,17 +10,26 @@ extern "C" {
|
|||
GGML_API void ggml_cl_init(void);
|
||||
|
||||
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
|
||||
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
GGML_API void * ggml_cl_host_malloc(size_t size);
|
||||
GGML_API void ggml_cl_host_free(void * ptr);
|
||||
// GGML_API void * ggml_cl_host_malloc(size_t size);
|
||||
// GGML_API void ggml_cl_host_free(void * ptr);
|
||||
|
||||
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
||||
|
||||
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
|
||||
// backend API
|
||||
|
||||
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
|
||||
|
||||
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
|
||||
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
2
ggml.c
2
ggml.c
|
@ -16601,7 +16601,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
return GGML_EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
|
||||
if (n_threads <= 0) {
|
||||
n_threads = GGML_DEFAULT_N_THREADS;
|
||||
}
|
||||
|
|
4
ggml.h
4
ggml.h
|
@ -1852,8 +1852,8 @@ extern "C" {
|
|||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
||||
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
||||
GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
|
|
79
llama.cpp
79
llama.cpp
|
@ -1204,8 +1204,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
|||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
// in some cases such as the KV cache, there is no benefit to using a host buffer,
|
||||
// since the data is never copied to the GPU
|
||||
// host buffers should only be used when data is expected to be copied to/from the GPU
|
||||
if (host_buffer) {
|
||||
buft = ggml_backend_cuda_host_buffer_type();
|
||||
}
|
||||
|
@ -1228,6 +1227,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
|||
buft = ggml_backend_metal_buffer_type();
|
||||
#elif defined(GGML_USE_CUBLAS)
|
||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
|
@ -1695,6 +1696,10 @@ static bool llama_kv_cache_init(
|
|||
cache.cells.clear();
|
||||
cache.cells.resize(n_ctx);
|
||||
|
||||
#ifdef GGML_USE_CLBLAST
|
||||
offload = false;
|
||||
#endif
|
||||
|
||||
// count used buffer types
|
||||
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
||||
if (offload) {
|
||||
|
@ -1702,9 +1707,10 @@ static bool llama_kv_cache_init(
|
|||
buft_layer_count[model.buft_layer[i].buft]++;
|
||||
}
|
||||
} else {
|
||||
buft_layer_count[llama_default_buffer_type_cpu(false)] = n_layer;
|
||||
buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
|
||||
}
|
||||
|
||||
// create a context for each buffer type
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
int n_layers = it.second;
|
||||
|
@ -2413,26 +2419,12 @@ struct llama_model_loader {
|
|||
}
|
||||
|
||||
void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
|
||||
/*
|
||||
// prefetch only CPU tensors
|
||||
if (use_mmap) {
|
||||
size_t size_pref = 0; // prefetch
|
||||
|
||||
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
||||
if (cur->backend == buffer_type_cpu) {
|
||||
size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
|
||||
size_pref = std::max(size_pref, tensor_end);
|
||||
}
|
||||
}
|
||||
mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
|
||||
}
|
||||
*/
|
||||
// prefetch the whole file - all the data is needed anyway
|
||||
if (use_mmap) {
|
||||
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
|
||||
}
|
||||
|
||||
// compute the total size of all tensors for progress reporting
|
||||
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
|
||||
size_data += ggml_nbytes(cur);
|
||||
|
@ -3173,16 +3165,15 @@ static bool llm_load_tensors(
|
|||
model.main_gpu = main_gpu;
|
||||
model.n_gpu_layers = n_gpu_layers;
|
||||
|
||||
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
||||
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
||||
|
||||
|
||||
// there is very little benefit to offloading the input layers, so always keep it on the CPU
|
||||
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
||||
model.buft_input = llama_default_buffer_type_cpu(true);
|
||||
|
||||
model.buft_layer.resize(n_layer);
|
||||
// cpu layers
|
||||
|
||||
// assign cpu layers
|
||||
for (int64_t i = 0; i < i_gpu_start; ++i) {
|
||||
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
|
@ -3191,11 +3182,8 @@ static bool llm_load_tensors(
|
|||
if (split_mode == LLAMA_SPLIT_LAYER) {
|
||||
// calculate the split points
|
||||
int device_count = ggml_backend_cuda_get_device_count();
|
||||
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
||||
float splits[GGML_CUDA_MAX_DEVICES];
|
||||
if (tensor_split != nullptr) {
|
||||
std::copy(tensor_split, tensor_split + device_count, splits);
|
||||
}
|
||||
bool all_zero = tensor_split == nullptr || std::all_of(splits, splits + device_count, [](float x) { return x == 0.0f; });
|
||||
if (all_zero) {
|
||||
// default split, by free memory
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
|
@ -3204,7 +3192,11 @@ static bool llm_load_tensors(
|
|||
ggml_backend_cuda_get_device_memory(i, &total, &free);
|
||||
splits[i] = free;
|
||||
}
|
||||
} else {
|
||||
std::copy(tensor_split, tensor_split + device_count, splits);
|
||||
}
|
||||
|
||||
// sum and normalize the splits to get the split points
|
||||
float split_sum = 0.0f;
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
split_sum += splits[i];
|
||||
|
@ -3214,15 +3206,15 @@ static bool llm_load_tensors(
|
|||
splits[i] /= split_sum;
|
||||
}
|
||||
|
||||
// assign GPU layers according to the splits to the devices
|
||||
// assign the repeating layers to the devices according to the splits
|
||||
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
||||
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
||||
int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
|
||||
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
||||
}
|
||||
// output layer
|
||||
// assign the output layer
|
||||
if (n_gpu_layers > n_layer) {
|
||||
int layer_gpu = std::upper_bound(splits, splits + device_count, float(n_layer)/act_gpu_layers) - splits;
|
||||
int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
|
||||
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
||||
} else {
|
||||
model.buft_output = llama_default_buffer_type_cpu(true);
|
||||
|
@ -3234,16 +3226,17 @@ static bool llm_load_tensors(
|
|||
if (split_mode == LLAMA_SPLIT_ROW) {
|
||||
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
||||
} else {
|
||||
// LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
|
||||
split_buft = llama_default_buffer_type_offload(main_gpu);
|
||||
}
|
||||
// repeating layers
|
||||
// assign the repeating layers
|
||||
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
||||
model.buft_layer[i] = {
|
||||
split_buft,
|
||||
llama_default_buffer_type_offload(main_gpu)
|
||||
};
|
||||
}
|
||||
// output layer
|
||||
// assign the output layer
|
||||
if (n_gpu_layers > n_layer) {
|
||||
model.buft_output = {
|
||||
split_buft,
|
||||
|
@ -3266,6 +3259,7 @@ static bool llm_load_tensors(
|
|||
}
|
||||
|
||||
// create one context per buffer type
|
||||
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
struct ggml_init_params params = {
|
||||
|
@ -3289,14 +3283,11 @@ static bool llm_load_tensors(
|
|||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
const int64_t n_vocab = hparams.n_vocab;
|
||||
const int64_t n_ff = hparams.n_ff;
|
||||
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
// FIXME: for metal, it may be better to put the input on the GPU context - however it may make no difference in practice,
|
||||
// and it would increase the metal buffer size
|
||||
ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
|
||||
ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
|
||||
ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
|
||||
|
@ -3617,8 +3608,8 @@ static bool llm_load_tensors(
|
|||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3});
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3});
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
|
@ -3736,8 +3727,7 @@ static bool llm_load_tensors(
|
|||
|
||||
ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
|
||||
|
||||
// create backend buffers
|
||||
|
||||
// create the backend buffers
|
||||
std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
|
||||
|
||||
for (auto & it : ctx_map) {
|
||||
|
@ -3745,8 +3735,9 @@ static bool llm_load_tensors(
|
|||
ggml_context * ctx = it.second;
|
||||
ggml_backend_buffer_t buf = nullptr;
|
||||
|
||||
// only the region containing the tensors in the model is mapped to the backend buffer
|
||||
// this is important for metal: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
||||
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
||||
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
||||
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
||||
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, ctx);
|
||||
|
@ -3771,7 +3762,7 @@ static bool llm_load_tensors(
|
|||
throw std::runtime_error("failed to allocate buffer");
|
||||
}
|
||||
// indicate that this buffer contains weights
|
||||
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are always scheduled to the backend that contains the weight
|
||||
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
||||
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
model.bufs.push_back(buf);
|
||||
ctx_bufs.emplace_back(ctx, buf);
|
||||
|
@ -3803,7 +3794,7 @@ static bool llm_load_tensors(
|
|||
}
|
||||
}
|
||||
|
||||
// load data
|
||||
// load tensor data
|
||||
for (auto & it : ctx_bufs) {
|
||||
ggml_context * ctx = it.first;
|
||||
ggml_backend_buffer_t buf = it.second;
|
||||
|
@ -3849,7 +3840,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
|
|||
return -2;
|
||||
}
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
||||
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue