move BLAS to a separate backend

slaren 2024-03-20 18:45:05 +01:00
parent b90dc566c1
commit 7f58793c56
16 changed files with 590 additions and 339 deletions

CMakeLists.txt

@@ -311,9 +311,9 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()
+    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    #    set(BLA_SIZEOF_INTEGER 8)
+    #endif()
     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)

@@ -380,6 +380,9 @@ if (LLAMA_BLAS)
         add_compile_definitions(GGML_BLAS_USE_MKL)
     endif()

+    set(GGML_HEADERS_BLAS ggml-blas.h)
+    set(GGML_SOURCES_BLAS ggml-blas.c)
+
     set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
     set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
 else()

@@ -1255,6 +1258,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
             ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
             ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )

Makefile

@@ -408,6 +408,7 @@ ifndef LLAMA_NO_ACCELERATE
         MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
         MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
         MK_LDFLAGS  += -framework Accelerate
+        OBJS        += ggml-blas.o
     endif
 endif # LLAMA_NO_ACCELERATE

@@ -421,23 +422,35 @@ ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
     MK_LDFLAGS  += $(shell pkg-config --libs openblas)
+    OBJS        += ggml-blas.o
 endif # LLAMA_OPENBLAS

+ifdef LLAMA_OPENBLAS64
+    MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas64)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
+    MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
+    OBJS        += ggml-blas.o
+endif # LLAMA_OPENBLAS64
+
+ifdef LLAMA_BLIS
+    MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+    MK_LDFLAGS  += -lblis -L/usr/local/lib
+    OBJS        += ggml-blas.o
+endif # LLAMA_BLIS
+
 ifndef LLAMA_NO_LLAMAFILE
     MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
     OBJS        += sgemm.o
 endif

-ifdef LLAMA_BLIS
-    MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-    MK_LDFLAGS  += -lblis -L/usr/local/lib
-endif # LLAMA_BLIS
-
 ifdef LLAMA_RPC
     MK_CPPFLAGS += -DGGML_USE_RPC
     OBJS        += ggml-rpc.o
 endif # LLAMA_RPC

+ggml-blas.o: ggml-blas.c ggml-blas.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 LLAMA_CUDA := 1

ggml-alloc.c

@@ -339,6 +339,7 @@ struct hash_node {
 };

 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };

@@ -349,7 +350,6 @@ struct leaf_alloc {
 };

 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };

@@ -511,17 +511,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }

-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }

-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;

@@ -626,11 +627,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                         AT_PRINTF("view_src %s: %d children, %d views\n",
                             view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                         if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                            ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                            ggml_gallocr_free_node(galloc, view_src);
                         }
                     }
                     else if (p_hn->allocated) {
-                        ggml_gallocr_free_node(galloc, parent, buffer_id);
+                        ggml_gallocr_free_node(galloc, parent);
                     }
                 }
                 AT_PRINTF("\n");
@@ -674,22 +675,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
+        //node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }

@@ -706,9 +711,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }

@@ -740,7 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

     if (tensor->view_src != NULL) {

@@ -768,8 +776,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }

-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }

@@ -793,7 +801,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];

-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif

@@ -805,7 +813,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }

-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif

@@ -846,7 +854,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }

     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {

@@ -857,9 +865,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }

     return true;
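
Net effect of the allocator changes above: the buffer assignment moves from node_alloc into tensor_alloc, so every dst/src slot carries its own buffer_id (-1 for pre-allocated or view tensors). Assembled from the hunks, the resulting structs are:

    struct tensor_alloc {
        int buffer_id;
        size_t offset;
        size_t size_max; // 0 = pre-allocated, unused, or view
    };

    struct node_alloc {
        struct tensor_alloc dst;
        struct tensor_alloc src[GGML_MAX_SRC];
    };

This is what lets ggml_gallocr_init_tensor and ggml_gallocr_node_needs_realloc drop their buffer_id/node_alloc parameters.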

ggml-backend-impl.h

@@ -17,13 +17,15 @@ extern "C" {
     struct ggml_backend_buffer_type_i {
         const char *          (*GGML_CALL get_name)      (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)   (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)  (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t                (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t                (*GGML_CALL get_max_size)  (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t                (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool                  (*GGML_CALL is_host)       (ggml_backend_buffer_type_t buft);
     };

@@ -92,27 +94,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);

         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

-        // check if the backend supports an operation
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
         ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
         void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        // wait for an event on on a different backend instance
         void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
         void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };

ggml-backend.c

@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
     return ggml_nbytes(tensor);
 }

-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
 bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);

@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }

+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return backend->iface.supports_buft(backend, buft);
+}
+
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     if (backend->iface.offload_op != NULL) {
         return backend->iface.offload_op(backend, op);

@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
     GGML_UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;

@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,

@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,

@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(backend);
+}
+
 static struct ggml_backend_i cpu_backend_i = {
     /* .get_name                = */ ggml_backend_cpu_name,
     /* .free                    = */ ggml_backend_cpu_free,

@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .synchronize             = */ NULL,
     /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .supports_buft           = */ ggml_backend_cpu_supports_buft,
     /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]

+    int * prev_node_backend_ids; // [graph_size]
+    int * prev_leaf_backend_ids; // [graph_size]
+
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
@@ -1097,15 +1100,16 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }

-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
     ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }

-    // find highest prio backend that supports the buffer type
+    // find highest prio backend that supports the buffer type and the op
     for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+            ggml_backend_supports_op(sched->backends[i], op)) {
             return i;
         }
     }

@@ -1126,12 +1130,17 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED
 #define GET_CAUSE(node) ""
 #endif

+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
     // TODO: use supports_op to check if the backend supports the op

     // assign pre-allocated nodes to their backend
-    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
         return cur_backend_id;

@@ -1139,7 +1148,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
         if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
             return cur_backend_id;

@@ -1161,7 +1170,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
@@ -1223,10 +1232,43 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }

-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
+static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+        *node_backend_id = cur_backend_id;
+        SET_CAUSE(node, "2.2");
+    } else {
+        for (int b = 0; b < sched->n_backends; b++) {
+            if (b == cur_backend_id) {
+                continue;
+            }
+            if (ggml_backend_supports_op(sched->backends[b], node)) {
+                *node_backend_id = b;
+                cur_backend_id = b;
+                SET_CAUSE(node, "2.2");
+                break;
+            }
+        }
+    }
+    return cur_backend_id;
+}
+
+static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
+    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+    ggml_backend_buffer_type_t buft = NULL;
+
+    if (buf) {
+        // the tensor is already allocated
+        buft = buf->buft;
+    } else {
+        // see if the tensor already has a backend assigned, and use the buffer type of that backend
+        int tensor_backend_id = tensor_backend_id(t);
+        if (tensor_backend_id != -1) {
+            buft = sched->bufts[tensor_backend_id];
+        }
+    }
+
+    return buft != NULL && ggml_backend_supports_buft(sched->backends[cur_backend_id], buft);
+}

 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1306,9 +1348,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+            } else if (cur_backend_id != -1) {
+                // FIXME: clean this
+                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                if (cur_backend_id == sched->n_backends - 1) {
+                    // skip cpu (lowest prio backend)
+                    cur_backend_id = -1;
+                }
             }
         }
     }

@@ -1328,9 +1374,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+            } else if (cur_backend_id != -1) {
+                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                if (cur_backend_id == sched->n_backends - 1) {
+                    // skip cpu (lowest prio backend)
+                    cur_backend_id = -1;
+                }
             }
         }
     }

@@ -1345,9 +1394,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+            } else if (cur_backend_id != -1) {
+                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }

@@ -1362,9 +1410,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+            } else if (cur_backend_id != -1) {
+                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
@@ -1448,10 +1495,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
             // check if the split has too many inputs
+            // FIXME: count the number of inputs instead of only checking when full
             if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                 const size_t id = hash_id(src);
                 int src_backend_id = sched->tensor_backend_id[id];
-                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                bool supported = buffer_supported(sched, src, cur_backend_id);
+                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                     //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                     need_new_split = true;
                     break;

@@ -1486,7 +1535,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             const int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
             if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                 size_t id = hash_id(src);
                 if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                     ggml_backend_t backend = sched->backends[src_backend_id];

@@ -1511,7 +1560,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }

-            if (src_backend_id != node_backend_id) {
+            bool supported = buffer_supported(sched, src, cur_backend_id);
+            if (src_backend_id != cur_backend_id && !supported) {
                 // create a copy of the input in the split's backend
                 const size_t id = hash_id(src);
                 if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
@@ -1543,6 +1593,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // create copies of the graph for each split
     // TODO: avoid this copy

+    // swap node_backend_ids and leaf_backend_ids and prevs
+    {
+        int * tmp = sched->node_backend_ids;
+        sched->node_backend_ids = sched->prev_node_backend_ids;
+        sched->prev_node_backend_ids = tmp;
+
+        tmp = sched->leaf_backend_ids;
+        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+        sched->prev_leaf_backend_ids = tmp;
+    }
+
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];

@@ -1613,8 +1675,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     }
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    bool backend_ids_changed = false;
+    for (int i = 0; i < sched->graph->n_nodes; i++) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+            backend_ids_changed = true;
+            break;
+        }
+    }
+    if (!backend_ids_changed) {
+        for (int i = 0; i < sched->graph->n_leafs; i++) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+                backend_ids_changed = true;
+                break;
+            }
+        }
+    }
+
     // allocate graph
-    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG

@@ -1735,6 +1813,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

     sched->n_backends = n_backends;

@@ -1747,7 +1827,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
-        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));

         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
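
Taken together, the scheduler changes replace the buffer-type-centric supports_backend check with the backend-centric supports_buft, which lets a backend such as the new BLAS one share host buffers with the CPU backend instead of forcing tensor copies. A minimal sketch of wiring both into the scheduler; the ggml_backend_sched_new signature is assumed from the surrounding code rather than shown in this diff:

    // sketch: schedule graphs across the new BLAS backend and the CPU backend
    ggml_backend_t backends[2] = {
        ggml_backend_blas_init(), // added by this commit (see ggml-blas.c below)
        ggml_backend_cpu_init(),  // last entry = lowest priority fallback
    };
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends, /*bufts=*/NULL, /*n_backends=*/2,
        /*graph_size=*/GGML_DEFAULT_GRAPH_SIZE, /*parallel=*/false);
    GGML_ASSERT(sched != NULL);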

ggml-backend.h

@@ -23,7 +23,6 @@ extern "C" {
     GGML_API           size_t ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
     GGML_API           size_t ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API           bool   ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

     // buffer

@@ -74,6 +73,7 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // tensor copy between different backends

@@ -90,7 +90,7 @@ extern "C" {
     GGML_API void ggml_backend_event_free       (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_record     (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
+    GGML_API void ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event);

     //
     // CPU backend

@@ -119,7 +119,7 @@ extern "C" {
     GGML_API size_t                     ggml_backend_reg_get_count(void);
     GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
     GGML_API const char *               ggml_backend_reg_get_name(size_t i);
     GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
     GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);

ggml-blas.c (new file, 257 lines)

@@ -0,0 +1,257 @@
#include "ggml-blas.h"
#include "ggml-backend-impl.h"
#include <stdlib.h>
#if defined(GGML_USE_ACCELERATE)
# include <Accelerate/Accelerate.h>
#elif defined(GGML_USE_OPENBLAS)
# if defined(GGML_BLAS_USE_MKL)
# include <mkl.h>
# else
# include <cblas.h>
# endif
#endif
struct ggml_backend_blas_context {
int n_threads;
void * work_data;
size_t work_size;
};
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(const struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
//const int64_t ne00 = src0->ne[0];
//const int64_t ne01 = src0->ne[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1->type == GGML_TYPE_F32 &&
((src0->type == GGML_TYPE_F32) || (ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
return true;
}
return false;
}
static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_TENSOR_BINARY_OP_LOCALS
const enum ggml_type type = src0->type;
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
const int64_t ne_plane = ne01*ne00;
const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne13*ne12*ne_plane*sizeof(float);
if (ctx->work_size < desired_wsize) {
free(ctx->work_data);
ctx->work_data = malloc(desired_wsize);
GGML_ASSERT(ctx->work_data != NULL);
ctx->work_size = desired_wsize;
}
void * wdata = ctx->work_data;
// convert src0 to float
if (true) {
if (type != GGML_TYPE_F32) {
ggml_to_float_t const to_float = type_traits.to_float;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
#ifdef GGML_USE_OPENMP
#pragma omp parallel for num_threads(ctx->n_threads)
#endif
for (int64_t i01 = 0; i01 < ne01; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
}
}
}
}
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne1, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
}
}
}
// backend interface
GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
return "BLAS";
GGML_UNUSED(backend);
}
GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
free(backend);
}
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(backend);
}
GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_backend_blas_mul_mat(ctx, node);
break;
// TODO
//case GGML_OP_OUT_PROD:
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
break;
default:
fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
GGML_ASSERT(false);
}
}
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
}
GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
return op->op == GGML_OP_MUL_MAT && ggml_compute_forward_mul_mat_use_blas(op);
GGML_UNUSED(backend);
}
GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
GGML_UNUSED(backend);
}
static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_name,
/* .free = */ ggml_backend_blas_free,
/* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .supports_op = */ ggml_backend_blas_supports_op,
/* .supports_buft = */ ggml_backend_blas_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
};
static ggml_guid_t ggml_backend_blas_guid(void) {
static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
return &guid;
}
ggml_backend_t ggml_backend_blas_init(void) {
ggml_backend_t backend = malloc(sizeof(struct ggml_backend));
if (backend == NULL) {
return NULL;
}
struct ggml_backend_blas_context * ctx = malloc(sizeof(struct ggml_backend_blas_context));
if (ctx == NULL) {
return NULL;
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->work_size = 0;
*backend = (struct ggml_backend) {
/* .guid = */ ggml_backend_blas_guid(),
/* .interface = */ blas_backend_i,
/* .context = */ ctx,
};
return backend;
}
GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
}
void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) {
GGML_ASSERT(ggml_backend_is_blas(backend_blas));
struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend_blas->context;
ctx->n_threads = n_threads;
}
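
The cblas_sgemm call in ggml_backend_blas_mul_mat realizes ggml's mul_mat convention one 2D plane at a time: with row-major Y of shape ne1 x ne10 (the src1 plane) and X of shape ne01 x ne00 (the src0 plane, converted to float first when quantized; ne00 == ne10), it computes

    D = 1.0f * Y * X^T + 0.0f * D,    D of shape ne1 x ne01

so rows of dst index rows of src1 and columns of dst index rows of src0, consistent with the GGML_ASSERT(ne0 == ne01) and GGML_ASSERT(ne1 == ne11) checks at the top of the function.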

ggml-blas.h (new file, 22 lines)

@@ -0,0 +1,22 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
#ifdef __cplusplus
}
#endif
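
The three declarations above are the backend's entire public surface. A minimal sketch of driving it directly; ggml_backend_free and ggml_backend_graph_compute are the standard ggml-backend helpers and their use here is an assumption, as they are not part of this diff:

    #include "ggml-blas.h"
    #include "ggml-backend.h"

    // sketch: run a prepared compute graph on the BLAS backend
    static void run_on_blas(struct ggml_cgraph * graph) {
        ggml_backend_t backend = ggml_backend_blas_init();
        if (backend == NULL) {
            return;
        }
        GGML_ASSERT(ggml_backend_is_blas(backend));
        // n_threads only affects the src0 -> float conversion loop
        ggml_backend_blas_set_n_threads(backend, 4);
        ggml_backend_graph_compute(backend, graph);
        ggml_backend_free(backend);
    }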

ggml-cuda.cu

@@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
     return ctx->name.c_str();
 }

+static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

@@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
     GGML_UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_cuda(backend)) {
-        return false;
-    }
-
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return buft_ctx->device == cuda_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };

@@ -929,6 +921,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
     GGML_UNUSED(buft);
 }

+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor

@@ -972,12 +968,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }

-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;

@@ -990,7 +980,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };

@@ -1090,7 +1079,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
         },
         /* .context = */ nullptr,
@@ -2919,6 +2907,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     GGML_UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_cuda_split(buft)) {
+        return true;
+    }
+
+    if (ggml_backend_buft_is_cuda(buft)) {
+        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+        ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+        return buft_ctx->device == cuda_ctx->device;
+    }
+
+    return false;
+}
+
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;

@@ -2991,9 +2993,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .synchronize             = */ ggml_backend_cuda_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
+    /* .supports_buft           = */ ggml_backend_cuda_supports_buft,
     /* .offload_op              = */ ggml_backend_cuda_offload_op,
     /* .event_new               = */ ggml_backend_cuda_event_new,
     /* .event_free              = */ ggml_backend_cuda_event_free,

ggml-kompute.cpp

@@ -1899,18 +1899,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
     return ctx->max_alloc;
 }

-static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    GGML_UNUSED(buft);
-    return ggml_backend_is_kompute(backend);
-}
-
 static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
     /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
     /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
@@ -1970,6 +1964,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
     return ggml_vk_supports_op(op);
 }

+static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(backend);
+    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+}
+
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name = */ ggml_backend_kompute_name,
     /* .free = */ ggml_backend_kompute_free,
@@ -1980,9 +1979,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_kompute_graph_compute,
     /* .supports_op = */ ggml_backend_kompute_supports_op,
+    /* .supports_buft = */ ggml_backend_kompute_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
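Kompute above, and Metal, SYCL, and Vulkan below, all recognize their own buffer type the same way: by comparing the buffer type's get_name function pointer instead of calling it and comparing strings. Every buffer type carries a distinct get_name function, so pointer equality is a cheap, allocation-free identity test. The idiom in isolation (a sketch; buft_is_mine is not a helper from this commit):

#include <stdbool.h>
#include "ggml-backend-impl.h"

// A buffer type is "ours" iff its vtable carries our get_name function.
static bool buft_is_mine(ggml_backend_buffer_type_t buft,
                         const char * (*my_get_name)(ggml_backend_buffer_type_t)) {
    return buft->iface.get_name == my_get_name;
}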


@@ -3040,12 +3040,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
     UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
@@ -3060,7 +3054,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-        /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
         /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
@@ -3175,6 +3168,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
     return ggml_metal_supports_op(metal_ctx, op);
 }

+GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+
+    UNUSED(backend);
+}
+
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name = */ ggml_backend_metal_name,
     /* .free = */ ggml_backend_metal_free,
@@ -3185,9 +3184,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_metal_graph_compute,
     /* .supports_op = */ ggml_backend_metal_supports_op,
+    /* .supports_buft = */ ggml_backend_metal_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
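Every backend touched by this commit stubs the new graph_plan_update slot with NULL. The intended contract (an assumption here; the diff itself only shows the slot being added) is that a holder of a compute plan can rebind it to a structurally identical graph when the callback exists, and must otherwise fall back to recreating the plan:

#include "ggml-backend-impl.h"

// Hedged sketch of how a scheduler might use the optional callback; `plan`
// and `graph` stand for a previously created plan and its updated cgraph.
if (backend->iface.graph_plan_update != NULL) {
    backend->iface.graph_plan_update(backend, plan, graph); // reuse the plan
} else {
    backend->iface.graph_plan_free(backend, plan);          // recreate it
    plan = backend->iface.graph_plan_create(backend, graph);
}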


@@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
     return ggml_nbytes(tensor);
 }

-GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_rpc(backend)) {
-        return false;
-    }
-    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
-    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
-    return buft_ctx->endpoint == rpc_ctx->endpoint;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
     /* .get_name = */ ggml_backend_rpc_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_rpc_get_max_size,
     /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
@@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const
     return false;
 }

+GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+    return buft_ctx->endpoint == rpc_ctx->endpoint;
+}
+
 static ggml_backend_i ggml_backend_rpc_interface = {
     /* .get_name = */ ggml_backend_rpc_name,
     /* .free = */ ggml_backend_rpc_free,
@@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .synchronize = */ ggml_backend_rpc_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_rpc_graph_compute,
     /* .supports_op = */ ggml_backend_rpc_supports_op,
+    /* .supports_buft = */ ggml_backend_rpc_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
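For RPC the buft/backend match is per endpoint: tensors allocated through one server's buffer type must not be scheduled onto a backend that talks to a different server. A hedged usage sketch (the endpoint strings are placeholders):

#include "ggml-rpc.h"

// Two RPC backends pointed at different servers: a buffer allocated via the
// buffer type of rpc_a passes supports_buft on rpc_a but fails it on rpc_b,
// because the endpoint stored in the buffer type context differs.
ggml_backend_t rpc_a = ggml_backend_rpc_init("192.168.1.10:50052");
ggml_backend_t rpc_b = ggml_backend_rpc_init("192.168.1.11:50052");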


@@ -16631,22 +16631,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
     UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_sycl(backend)) {
-        return false;
-    }
-    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    return buft_ctx->device == sycl_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .get_name = */ ggml_backend_sycl_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
     /* .is_host = */ nullptr,
 };
@@ -16998,12 +16988,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }

-GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_sycl(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
@@ -17016,7 +17000,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
     /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
     /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
 };
@@ -17102,7 +17085,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
         /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -17367,6 +17349,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    return buft_ctx->device == sycl_ctx->device;
+}
+
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name = */ ggml_backend_sycl_name,
@@ -17378,9 +17368,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .synchronize = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_sycl_graph_compute,
     /* .supports_op = */ ggml_backend_sycl_supports_op,
+    /* .supports_buft = */ ggml_backend_sycl_supports_buft,
     /* .offload_op = */ ggml_backend_sycl_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,


@@ -6110,24 +6110,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
@@ -6203,7 +6191,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
         /* .get_max_size = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -6524,6 +6511,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
@@ -6535,9 +6533,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .supports_buft = */ ggml_backend_vk_supports_buft,
     /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,

ggml.c

@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {

 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
 #endif

 // floating point type used to accumulate sums
@@ -12216,39 +12210,6 @@ static void ggml_compute_forward_group_norm(

 // ggml_compute_forward_mul_mat

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
-    //       all the experts for each batch element and the processing would become incredibly slow
-    // TODO: find the optimal values for these
-    if (dst->op != GGML_OP_MUL_MAT_ID &&
-        ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        //src0->type == GGML_TYPE_F32 &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
 static void ggml_compute_forward_mul_mat_one_chunk(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst,
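The deleted helper's logic is worth keeping in mind when reading the new backend: BLAS only pays off when every GEMM dimension is reasonably large, the operands are contiguous, and src1 is already F32 (src0 may be dequantized into a work buffer). A condensed restatement of the removed check (a sketch for reference, not the new backend's code; the thresholds of 32 come straight from the deleted lines and the TODO about tuning them still applies):

#include <stdbool.h>
#include <stdint.h>

// ne0/ne1 are the dst dimensions, ne10 is the shared inner (K) dimension.
// Prefer BLAS only when all three reach the (untuned) threshold of 32 and
// the inputs qualify; MUL_MAT_ID is excluded for the reason noted above.
static bool prefer_blas(int64_t ne0, int64_t ne1, int64_t ne10,
                        bool contiguous, bool src1_is_f32) {
    return contiguous && src1_is_f32 && ne0 >= 32 && ne1 >= 32 && ne10 >= 32;
}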
@@ -12386,73 +12347,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane = ne01*ne00;
-        const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
-
-        if (params->type == GGML_TASK_TYPE_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
-
-                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                        ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
-                        }
-                    }
-                }
-            }
-            return;
-        }
-
-        if (params->type == GGML_TASK_TYPE_FINALIZE) {
-            return;
-        }
-
-        // perform sgemm, parallelization controlled by blas lib
-        if (ith != 0) {
-            return;
-        }
-
-        //const int64_t tgemm0 = ggml_perf_time_us();
-        for (int64_t i13 = 0; i13 < ne13; i13++) {
-            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                if (type != GGML_TYPE_F32) {
-                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                }
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                            ne1, ne01, ne10,
-                            1.0f, y, ne10,
-                                  x, ne00,
-                            0.0f, d, ne01);
-            }
-        }
-        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif

 #if GGML_USE_LLAMAFILE
     const bool src1_cont = ggml_is_contiguous(src1);
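The sgemm call that just left ggml.c computes dst = src1 · src0ᵀ in row-major terms: y (src1) is passed untransposed, x (src0, or its dequantized copy in wdata) is passed with CblasTrans, and the leading dimensions are the row strides ne10, ne00, ne01. A self-contained toy program with concrete sizes reproduces the argument shape (an illustration of the call's semantics, not code from the new backend):

#include <cblas.h>
#include <stdio.h>

int main(void) {
    // d(ne1 x ne01) = y(ne1 x ne10) * x(ne01 x ne10)^T, all row-major --
    // the same shape as the cblas_sgemm call removed above (here ne00 == ne10).
    enum { ne1 = 2, ne01 = 2, ne10 = 3 };
    const float y[ne1*ne10]  = {1, 2, 3,   4, 5, 6}; // rows of src1
    const float x[ne01*ne10] = {1, 0, 1,   0, 1, 0}; // rows of src0
    float d[ne1*ne01];

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne1, ne01, ne10,
                1.0f, y, ne10,
                      x, ne10,
                0.0f, d, ne01);

    // each d[i][j] is the dot product of row i of y with row j of x:
    // expected output: 4 2 10 5
    for (int i = 0; i < ne1*ne01; i++) {
        printf("%g ", d[i]);
    }
    printf("\n");
    return 0;
}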
@@ -12833,19 +12727,7 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    bool use_blas = ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
     if (params->type == GGML_TASK_TYPE_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
-        if (use_blas) {
-            return;
-        }
-#endif
         if (ith != 0) {
             return;
         }
@@ -12857,50 +12739,6 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (use_blas) {
-        if (params->ith != 0) { // All threads other than the first do no work.
-            return;
-        }
-        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-        // src0: (k,n)
-        // src1: (k,m)
-        // dst:  (m,n)
-        //
-        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-        // Also expressed as (major,minor)
-        // a: (m,k): so src1 transposed
-        // b: (k,n): so src0
-        // c: (m,n)
-        //
-        // However, if ggml_is_transposed(src1) is true, then
-        // src1->data already contains a transposed version, so sgemm mustn't
-        // transpose it further.
-
-        int n = src0->ne[0];
-        int k = src0->ne[1];
-        int m = src1->ne[0];
-
-        int transposeA, lda;
-
-        if (!ggml_is_transposed(src1)) {
-            transposeA = CblasTrans;
-            lda = m;
-        } else {
-            transposeA = CblasNoTrans;
-            lda = k;
-        }
-
-        float * a = (float *) ((char *) src1->data);
-        float * b = (float *) ((char *) src0->data);
-        float * c = (float *) ((char *) dst->data);
-
-        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-        return;
-    }
-#endif
-
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
@@ -13030,8 +12868,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -13428,6 +13264,8 @@ static void ggml_compute_forward_get_rows_q(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+        assert(i01 >= 0 && i01 < ne01);
+
         dequantize_row_q(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13471,6 +13309,8 @@ static void ggml_compute_forward_get_rows_f16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_fp16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13514,7 +13354,9 @@ static void ggml_compute_forward_get_rows_bf16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

-        ggml_bf16_to_fp32_row(
+        assert(i01 >= 0 && i01 < ne01);
+
+        ggml_bf16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
@@ -13557,6 +13399,8 @@ static void ggml_compute_forward_get_rows_f32(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_vec_cpy_f32(nc,
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
                 (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -19504,17 +19348,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
             {
                 const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                    if (node->src[0]->type != GGML_TYPE_F32) {
-                        // here we need memory for fully dequantized matrix from src0
-                        // take into account that src0 can be broadcasted into src1[2,3]
-                        cur = ggml_type_size(GGML_TYPE_F32)
-                            * node->src[0]->ne[0]*node->src[0]->ne[1]
-                            * node->src[1]->ne[2]*node->src[1]->ne[3];
-                    }
-                } else
-#endif
                 if (node->src[1]->type != vec_dot_type) {
                     cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                 }
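The sizing that disappears here reserved room for a fully dequantized copy of src0, broadcast across src1's batch dimensions: sizeof(float) * ne00*ne01 * ne12*ne13 bytes. As a worked example (illustrative numbers, not from the diff), a 4096x4096 quantized weight broadcast over ne12*ne13 = 8 batch planes would need 4*4096*4096*8 bytes = 512 MiB of scratch. Moving the dequantize-then-sgemm path into a dedicated backend lets that buffer be owned and sized by the backend itself instead of being threaded through every graph plan.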


@@ -21,6 +21,10 @@
 #  include "ggml-kompute.h"
 #endif

+#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
+#  include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
@@ -2298,9 +2302,13 @@ struct llama_context {
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
+#endif
+#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
+    ggml_backend_t backend_blas = nullptr;
 #endif
     ggml_backend_t backend_cpu = nullptr;

     const llama_model & model;

     // key + value cache for the self attention
@@ -11516,17 +11524,17 @@ static struct ggml_cgraph * llama_build_graph(
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
-        if (batch.n_tokens < 32 || full_offload) {
-            if (il != -1 && strcmp(name, "norm") == 0) {
-                for (auto * backend : lctx.backends) {
-                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-                        break;
-                    }
-                }
-            }
-        }
+        //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        //if (batch.n_tokens < 32 || full_offload) {
+        //    if (il != -1 && strcmp(name, "norm") == 0) {
+        //        for (auto * backend : lctx.backends) {
+        //            if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+        //                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+        //                break;
+        //            }
+        //        }
+        //    }
+        //}
     };

     struct ggml_cgraph * result = NULL;
@@ -12017,6 +12025,11 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
+#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif

     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
@@ -12246,9 +12259,9 @@ static int llama_decode_internal(
     // with the BLAS calls. need a better solution
     // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
     // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
+    //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    //    n_threads = std::min(4, n_threads);
+    //}

     ggml_backend_sched_alloc_graph(lctx.sched, gf);
@@ -16226,6 +16239,16 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
 #if defined(GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
             for (const auto & endpoint : model->rpc_servers) {
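With initialization, thread control, and the build rules in place, the new backend's public surface used by this diff is just two entry points: ggml_backend_blas_init() and ggml_backend_blas_set_n_threads(). A plausible outline of ggml-blas.h consistent with those call sites (the ggml_backend_is_blas predicate and the exact GGML_API/GGML_CALL decorations are assumptions modeled on the other backend headers, not shown in this diff):

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);

GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);

// number of threads used for conversion to float
// for OpenBLAS and BLIS this also sets the number of threads used by the BLAS kernels
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);

#ifdef __cplusplus
}
#endif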