move BLAS to a separate backend (#6210)

* move BLAS to a separate backend

* rename GGML_USE_OPENBLAS to GGML_USE_BLAS

* alloc : reuse same buffer when the same buffer type is used multiple times

* set number of threads automatically for OpenBLAS and BLIS

* sched : print assignments when GGML_SCHED_DEBUG env variable is set

* sched : allow ops with weights on an incompatible buffer type

This will cause the weight to be copied to a backend that supports the
op, which is very costly. The weight should have been stored in a buffer
of a backend that can run the op, but llama.cpp cannot do this
automatically at the moment.
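For illustration, a minimal sketch of how the new BLAS backend might be wired into a scheduler next to the CPU backend follows. The names ggml_backend_blas_init and ggml_backend_blas_set_n_threads are assumed from the headers this change introduces and do not appear in the diff shown below, so treat the snippet as a sketch under those assumptions rather than the definitive API:

#include "ggml-backend.h"
#include "ggml-blas.h"   // assumed header for the new BLAS backend

// Build a scheduler where BLAS has higher priority than the CPU backend:
// the scheduler assigns each op to the first backend that supports both the
// op and the buffer type holding its weights, and falls back to the CPU.
static ggml_backend_sched_t make_sched(size_t graph_size) {
    ggml_backend_t blas = ggml_backend_blas_init();   // assumed init for the new backend
    ggml_backend_t cpu  = ggml_backend_cpu_init();

    if (blas == NULL) {
        // BLAS backend unavailable in this build: schedule on the CPU only
        ggml_backend_t cpu_only[1] = { cpu };
        return ggml_backend_sched_new(cpu_only, NULL, 1, graph_size, false);
    }

    // The thread count is picked automatically for OpenBLAS/BLIS, but it can
    // also be set explicitly (assumed setter added by this change).
    ggml_backend_blas_set_n_threads(blas, 8);

    ggml_backend_t backends[2] = { blas, cpu };
    return ggml_backend_sched_new(backends, NULL, 2, graph_size, false);
}

Running with the GGML_SCHED_DEBUG environment variable set then prints the per-node backend assignments introduced by this change.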

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Author: slaren, 2024-06-13 03:11:35 +02:00 (committed by GitHub)
parent 1c641e6aac
commit f578b86b21
17 changed files with 821 additions and 379 deletions

@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
return ggml_nbytes(tensor);
}
bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return buft->iface.supports_backend(buft, backend);
}
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
if (buft->iface.is_host) {
return buft->iface.is_host(buft);
@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
return backend->iface.supports_op(backend, op);
}
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return backend->iface.supports_buft(backend, buft);
}
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
if (backend->iface.offload_op != NULL) {
return backend->iface.offload_op(backend, op);
@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
GGML_UNUSED(buft);
}
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_cpu(backend);
GGML_UNUSED(buft);
}
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;
@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
GGML_UNUSED(backend);
}
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
GGML_UNUSED(backend);
}
static struct ggml_backend_i cpu_backend_i = {
/* .get_name = */ ggml_backend_cpu_name,
/* .free = */ ggml_backend_cpu_free,
@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ ggml_backend_cpu_supports_op,
/* .supports_buft = */ ggml_backend_cpu_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
int * node_backend_ids; // [graph_size]
int * leaf_backend_ids; // [graph_size]
int * prev_node_backend_ids; // [graph_size]
int * prev_leaf_backend_ids; // [graph_size]
// copy of the graph with modified inputs
struct ggml_cgraph * graph;
@@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
ggml_backend_sched_eval_callback callback_eval;
void * callback_eval_user_data;
bool debug;
// align context_buffer to GGML_MEM_ALIGN
#ifdef _MSC_VER
__declspec(align(GGML_MEM_ALIGN))
@@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
return -1;
}
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
ggml_backend_buffer_t buffer = tensor->buffer;
if (buffer == NULL) {
return -1;
}
// find highest prio backend that supports the buffer type
// find highest prio backend that supports the buffer type and the op
for (int i = 0; i < sched->n_backends; i++) {
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
ggml_backend_supports_op(sched->backends[i], op)) {
return i;
}
}
fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
__func__, ggml_backend_buffer_name(buffer), tensor->name);
GGML_ASSERT(false);
#ifndef NDEBUG
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
#endif
return -1;
}
@@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
// TODO: use supports_op to check if the backend supports the op
// assign pre-allocated nodes to their backend
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
if (cur_backend_id != -1) {
SET_CAUSE(tensor, "1.dst");
return cur_backend_id;
@@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
// view_src
if (tensor->view_src != NULL) {
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
if (cur_backend_id != -1) {
SET_CAUSE(tensor, "1.vsrc");
return cur_backend_id;
@@ -1161,7 +1168,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
continue;
}
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1) {
for (int b = 0; b < src_backend_id; b++) {
@@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
}
}
//#define DEBUG_PASS1
//#define DEBUG_PASS2
//#define DEBUG_PASS3
//#define DEBUG_PASS4
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
ggml_backend_buffer_type_t buft = NULL;
if (buf) {
// the tensor is already allocated
buft = buf->buft;
} else {
// see if the tensor already has a backend assigned, and use the buffer type of that backend
int tensor_backend_id = tensor_backend_id(t);
if (tensor_backend_id == -1 && t->view_src) {
tensor_backend_id = tensor_backend_id(t->view_src);
}
if (tensor_backend_id != -1) {
buft = sched->bufts[tensor_backend_id];
}
}
return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
}
static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.sup");
}
}
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
#ifdef DEBUG_PASS1
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
// pass 2: expand current backend assignments
// assign the same backend to adjacent nodes
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
// pass 2.2 expand gpu down
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
// expand gpu down
{
int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
@@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} else {
cur_backend_id = *node_backend_id;
}
} else {
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.2");
} else if (cur_backend_id != -1) {
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
}
}
}
// pass 2.1 expand gpu up
// expand gpu up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} else {
cur_backend_id = *node_backend_id;
}
} else {
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.1");
} else if (cur_backend_id != -1) {
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
}
}
}
// pass 2.4 expand rest down
// expand rest down
{
int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
@@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
cur_backend_id = *node_backend_id;
} else {
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.4");
} else if (cur_backend_id != -1) {
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
}
}
}
// pass 2.3 expand rest up
// expand rest up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
cur_backend_id = *node_backend_id;
} else {
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.3");
} else if (cur_backend_id != -1) {
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
}
}
}
#ifdef DEBUG_PASS2
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
// however, we also need to verify that the sources are in compatible buffer types
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id == -1) {
// unassigned node: find the backend with the most supported inputs
int n_supported_best = -1;
for (int b = 0; b < sched->n_backends; b++) {
if (ggml_backend_supports_op(sched->backends[b], node)) {
int n_supported = 0;
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
n_supported++;
}
}
if (n_supported > n_supported_best) {
n_supported_best = n_supported;
*node_backend_id = b;
SET_CAUSE(node, "3.best");
}
}
}
} else {
// assigned node: upgrade to higher prio backend if possible
for (int b = 0; b < *node_backend_id; b++) {
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
bool supported = true;
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
supported = false;
break;
}
}
if (supported) {
*node_backend_id = b;
SET_CAUSE(node, "3.upg");
break;
}
}
}
}
}
// pass 3: assign backends to remaining src from dst and view_src
// pass 4: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int * cur_backend_id = &tensor_backend_id(node);
if (node->view_src != NULL && *cur_backend_id == -1) {
*cur_backend_id = tensor_backend_id(node->view_src);
SET_CAUSE(node, "3.vsrc");
SET_CAUSE(node, "4.vsrc");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
@@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (src->view_src != NULL) {
// views are always on the same backend as the source
*src_backend_id = tensor_backend_id(src->view_src);
SET_CAUSE(src, "3.vsrc");
SET_CAUSE(src, "4.vsrc");
} else {
*src_backend_id = *cur_backend_id;
SET_CAUSE(src, "3.cur");
SET_CAUSE(src, "4.cur");
}
}
}
}
#ifdef DEBUG_PASS3
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
// pass 4: split graph, find tensors that need to be copied
{
@@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
// check if the split has too many inputs
// FIXME: count the number of inputs instead of only checking when full
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
const size_t id = hash_id(src);
int src_backend_id = sched->tensor_backend_id[id];
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true;
break;
@@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int src_backend_id = tensor_backend_id(src);
assert(src_backend_id != -1); // all inputs should be assigned by now
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
size_t id = hash_id(src);
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
if (src_backend_id != node_backend_id) {
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && !supported) {
// create a copy of the input in the split's backend
const size_t id = hash_id(src);
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
@@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
split->i_end = graph->n_nodes;
sched->n_splits = i_split + 1;
}
#ifdef DEBUG_PASS4
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
if (sched->debug) {
ggml_backend_sched_print_assignments(sched, graph);
}
// swap node_backend_ids and leaf_backend_ids and prevs
{
int * tmp = sched->node_backend_ids;
sched->node_backend_ids = sched->prev_node_backend_ids;
sched->prev_node_backend_ids = tmp;
tmp = sched->leaf_backend_ids;
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
sched->prev_leaf_backend_ids = tmp;
}
// create copies of the graph for each split
// TODO: avoid this copy
@@ -1613,8 +1704,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
bool backend_ids_changed = false;
for (int i = 0; i < sched->graph->n_nodes; i++) {
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
backend_ids_changed = true;
break;
}
}
if (!backend_ids_changed) {
for (int i = 0; i < sched->graph->n_leafs; i++) {
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
backend_ids_changed = true;
break;
}
}
}
// allocate graph
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
// the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG
@@ -1727,6 +1834,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
// initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
@@ -1735,6 +1844,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
sched->n_backends = n_backends;
@@ -1747,7 +1858,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
for (int b = 0; b < n_backends; b++) {
sched->backends[b] = backends[b];
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
if (sched->n_copies > 1) {
for (int c = 0; c < sched->n_copies; c++) {
sched->events[b][c] = ggml_backend_event_new(backends[b]);
@@ -1779,6 +1890,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
free(sched->tensor_copies);
free(sched->node_backend_ids);
free(sched->leaf_backend_ids);
free(sched->prev_node_backend_ids);
free(sched->prev_leaf_backend_ids);
free(sched);
}
@@ -1875,6 +1988,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index;
SET_CAUSE(node, "usr");
}
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {