move BLAS to a separate backend (#6210)
* move BLAS to a separate backend
* rename GGML_USE_OPENBLAS to GGML_USE_BLAS
* alloc : reuse same buffer when the same buffer type is used multiple times
* set number of threads automatically for openblas and blis
* sched : print assignments when GGML_SCHED_DEBUG env variable is set
* sched : allow ops with weights on an incompatible buffer type

This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment.

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
1c641e6aac
commit
f578b86b21
17 changed files with 821 additions and 379 deletions
|
@ -6142,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
|
|||
UNUSED(buft);
|
||||
}
|
||||
|
||||
// Reports whether the buffer type `buft` may be used with `backend`.
// Returns false when ggml_backend_is_vk(backend) is false; otherwise returns
// true only if the Vulkan context referenced by the buffer type's context has
// the same `idx` as the backend's context.
GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    if (ggml_backend_is_vk(backend)) {
        ggml_backend_vk_context * backend_ctx = (ggml_backend_vk_context *) backend->context;
        ggml_backend_vk_buffer_type_context * type_ctx = (ggml_backend_vk_buffer_type_context *) buft->context;

        // Match on the context index rather than on pointer identity.
        return type_ctx->ctx->idx == backend_ctx->idx;
    }

    return false;
}
|
||||
|
||||
// Callback table for the Vulkan buffer type. The initializer is positional:
// each entry fills the ggml_backend_buffer_type_i slot named in its trailing
// comment, so the order of the entries must not change.
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
    ggml_backend_vk_buffer_type_name,             // .get_name
    ggml_backend_vk_buffer_type_alloc_buffer,     // .alloc_buffer
    ggml_backend_vk_buffer_type_get_alignment,    // .get_alignment
    ggml_backend_vk_buffer_type_get_max_size,     // .get_max_size
    ggml_backend_vk_buffer_type_get_alloc_size,   // .get_alloc_size
    ggml_backend_vk_buffer_type_supports_backend, // .supports_backend
    NULL,                                         // .is_host (no callback provided)
};
|
||||
|
||||
|
@ -6235,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|||
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .context = */ nullptr,
|
||||
|
@ -6551,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
// Reports whether `backend` can work with buffers of type `buft`.
// A buffer type is accepted only when its get_name callback is
// ggml_backend_vk_buffer_type_name and the Vulkan context referenced by its
// context has the same `idx` as the backend's context.
GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    // The get_name callback identifies the buffer type; only then is the
    // context cast below applied.
    const bool is_vk_buft = buft->iface.get_name == ggml_backend_vk_buffer_type_name;

    if (is_vk_buft) {
        ggml_backend_vk_context * backend_ctx = (ggml_backend_vk_context *) backend->context;
        ggml_backend_vk_buffer_type_context * type_ctx = (ggml_backend_vk_buffer_type_context *) buft->context;

        return type_ctx->ctx->idx == backend_ctx->idx;
    }

    return false;
}
|
||||
|
||||
// TODO: enable async and synchronize
|
||||
static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_name,
|
||||
|
@ -6562,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_vk_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_vk_offload_op,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue