code cleanup

Author: slaren
Date:   2024-03-15 17:43:53 +01:00
Commit: 3a774427ae
Parent: c2dba0450f

2 changed files with 93 additions and 155 deletions

ggml-backend.c

@@ -1122,27 +1122,26 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op
     // assign pre-allocated nodes to their backend
-    // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
-    if (cur_backend != -1) {
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
-        return cur_backend;
+        return cur_backend_id;
     }
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
-        if (cur_backend != -1) {
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
-            return cur_backend;
+            return cur_backend_id;
         }
     }
-    // input
+    // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
-        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
         SET_CAUSE(tensor, "1.inp");
-        return cur_backend;
+        return cur_backend_id;
     }
     // assign nodes that use weights to the backend of the weights
@@ -1153,10 +1152,10 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
-            // check if a backend with higher prio wants to run the op
-            if (src_backend == sched->n_backends - 1) {
-                for (int b = 0; b < src_backend; b++) {
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            // check if a backend with higher prio wants to offload the op
+            if (src_backend_id == sched->n_backends - 1) {
+                for (int b = 0; b < src_backend_id; b++) {
                     if (ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
                         return b;
@@ -1164,7 +1163,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
                }
            }
            SET_CAUSE(tensor, "1.wgt%d", i);
-            return src_backend;
+            return src_backend_id;
        }
    }
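Note for readers of the later hunks: the rename from cur_backend to cur_backend_id goes together with a second change below, where the result of tensor_backend_id() is treated as an lvalue whose address can be cached. A minimal self-contained sketch of that pattern follows; the macro body here is an assumption for illustration, not the ggml definition:

    #include <stdio.h>

    // Illustrative stand-ins: a macro that expands to a slot in a lookup table,
    // the way tensor_backend_id() resolves to an entry keyed by the tensor.
    static int backend_ids[16];
    static int lookups = 0;

    static int hash_id(int key) { lookups++; return key % 16; }
    #define tensor_backend_id(key) backend_ids[hash_id(key)]

    int main(void) {
        // repeated macro use: one table lookup per read or write
        if (tensor_backend_id(7) == 0) {
            tensor_backend_id(7) = 3;
        }
        // cached pointer: a single lookup, then direct access through the slot
        int * slot = &tensor_backend_id(7);
        if (*slot == 3) {
            *slot = 5;
        }
        printf("lookups performed: %d, final id: %d\n", lookups, backend_ids[7]);
        return 0;
    }

Caching the slot pointer performs the lookup once per tensor instead of once per access, which is what the pointer-based rewrite in the next hunks buys.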
@@ -1244,28 +1243,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-        if (tensor_backend_id(leaf) != -1) {
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        if (*leaf_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        if (tensor_backend_id(node) != -1) {
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
+        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            if (tensor_backend_id(src) == -1) {
-                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1287,21 +1289,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                }
            } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.2");
            }
        }
    }
    // pass 2.1 expand gpu up
    {
        int cur_backend_id = -1;
@@ -1310,22 +1311,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                }
            } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.1");
            }
        }
    }
    // pass 2.4 expand rest down
    {
        int cur_backend_id = -1;
@@ -1334,11 +1333,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
            } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.4");
            }
        }
@@ -1351,11 +1350,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
            } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.3");
            }
        }
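Passes 2.1 through 2.4 above all share one propagation shape: walk the nodes in a fixed direction and give every unassigned node the backend id of the closest already-assigned neighbor. A simplified, self-contained sketch of that shape (it omits the CPU special case and the view-op skip of the real passes):

    #include <stdio.h>

    #define N_NODES 8

    // Propagate assigned backend ids forward over unassigned entries (-1),
    // mirroring the shape of the "expand" passes in ggml_backend_sched_split_graph.
    static void expand_forward(int * ids, int n) {
        int cur = -1;
        for (int i = 0; i < n; i++) {
            if (ids[i] != -1) {
                cur = ids[i];   // remember the last explicit assignment
            } else {
                ids[i] = cur;   // fill the gap with the current backend id
            }
        }
    }

    int main(void) {
        int ids[N_NODES] = { 0, -1, -1, 1, -1, -1, 0, -1 };
        expand_forward(ids, N_NODES);
        for (int i = 0; i < N_NODES; i++) {
            printf("node %d -> backend %d\n", i, ids[i]);
        }
        return 0;
    }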
@@ -1368,9 +1367,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    // pass 3: assign backends to remaining src from dst and view_src
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
-        int cur_backend_id = tensor_backend_id(node);
-        if (node->view_src != NULL && cur_backend_id == -1) {
-            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
+        int * cur_backend_id = &tensor_backend_id(node);
+        if (node->view_src != NULL && *cur_backend_id == -1) {
+            *cur_backend_id = tensor_backend_id(node->view_src);
            SET_CAUSE(node, "3.vsrc");
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1378,14 +1377,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (src == NULL) {
                continue;
            }
-            int src_backend_id = tensor_backend_id(src);
-            if (src_backend_id == -1) {
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
                if (src->view_src != NULL) {
                    // views are always on the same backend as the source
-                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
+                    *src_backend_id = tensor_backend_id(src->view_src);
                    SET_CAUSE(src, "3.vsrc");
                } else {
-                    tensor_backend_id(src) = cur_backend_id;
+                    *src_backend_id = *cur_backend_id;
                    SET_CAUSE(src, "3.cur");
                }
            }
@@ -1397,19 +1396,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    // pass 4: split graph, find tensors that need to be copied
    {
-        int cur_split = 0;
+        int i_split = 0;
+        struct ggml_backend_sched_split * split = &sched->splits[0];
        // find the backend of the first split, skipping view ops
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (!ggml_is_view_op(node->op)) {
-                sched->splits[0].backend_id = tensor_backend_id(node);
+                split->backend_id = tensor_backend_id(node);
                break;
            }
        }
-        sched->splits[0].i_start = 0;
-        sched->splits[0].n_inputs = 0;
-        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-        int cur_backend_id = sched->splits[0].backend_id;
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
@@ -1417,13 +1417,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                continue;
            }
-            int tensor_backend_id = tensor_backend_id(node);
-            GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
+            const int node_backend_id = tensor_backend_id(node);
+            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
            // check if a weight is on a different backend and start a new split if so
+            // by starting a new split, the memory of the previously offloaded weights can be reused
            bool offload = false;
-            if (tensor_backend_id == cur_backend_id && sched->splits[cur_split].n_inputs > 0) {
+            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * src = node->src[j];
                    if (src == NULL) {
@@ -1439,14 +1440,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                }
            }
-            if (tensor_backend_id != cur_backend_id || offload) {
-                sched->splits[cur_split].i_end = i;
-                cur_split++;
-                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
-                sched->splits[cur_split].backend_id = tensor_backend_id;
-                sched->splits[cur_split].i_start = i;
-                sched->splits[cur_split].n_inputs = 0;
-                cur_backend_id = tensor_backend_id;
+            if (node_backend_id != cur_backend_id || offload) {
+                split->i_end = i;
+                i_split++;
+                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+                split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
+                cur_backend_id = node_backend_id;
            }
            // find inputs that are not on the same backend
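Pass 4 now keeps a split pointer to the current entry of sched->splits instead of re-indexing sched->splits[i_split] for every field access. The fields touched in this hunk suggest a per-split record roughly like the sketch below; the field set is inferred from the diff alone, and the real struct ggml_backend_sched_split may contain more members:

    // Inferred per-split bookkeeping (sketch, not the ggml definition).
    struct ggml_tensor;                      // forward declaration is enough for pointers

    struct sched_split_sketch {
        int backend_id;                      // backend that executes this range of nodes
        int i_start;                         // index of the first node in the split
        int i_end;                           // one past the last node in the split
        struct ggml_tensor * inputs[16];     // inputs copied in from other backends
                                             // (GGML_SCHED_MAX_SPLIT_INPUTS in the real code)
        int n_inputs;                        // number of such inputs
    };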
@@ -1456,10 +1458,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    continue;
                }
-                int src_backend_id = tensor_backend_id(src);
+                const int src_backend_id = tensor_backend_id(src);
                assert(src_backend_id != -1); // all inputs should be assigned by now
-                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                    size_t id = hash_id(src);
                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                        ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1476,7 +1478,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                            }
                            sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
-                            tensor_backend_id(tensor_copy) = src_backend_id;
                            SET_CAUSE(tensor_copy, "4.cpy");
                        }
                        int n_graph_inputs = sched->n_graph_inputs++;
@@ -1485,9 +1486,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    }
                }
-                if (src_backend_id != tensor_backend_id) {
+                if (src_backend_id != node_backend_id) {
                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
+                    const size_t id = hash_id(src);
                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                        ggml_backend_t backend = sched->backends[cur_backend_id];
                        for (int c = 0; c < sched->n_copies; c++) {
@@ -1498,60 +1499,23 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                            }
                            sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
-                            tensor_backend_id(tensor_copy) = cur_backend_id;
                            SET_CAUSE(tensor_copy, "4.cpy");
                        }
-                    int n_inputs = sched->splits[cur_split].n_inputs++;
+                        int n_inputs = split->n_inputs++;
                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
-                    sched->splits[cur_split].inputs[n_inputs] = src;
+                        split->inputs[n_inputs] = src;
                    }
                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                }
            }
        }
-        sched->splits[cur_split].i_end = graph->n_nodes;
-        sched->n_splits = cur_split + 1;
+        split->i_end = graph->n_nodes;
+        sched->n_splits = i_split + 1;
    }
#ifdef DEBUG_PASS4
    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        if (tensor_backend == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
-                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            if (src_backend != tensor_backend /* && src_backend != NULL */) {
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
-            }
-            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
-                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
    // create copies of the graph for each split
    // TODO: avoid this copy
    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
@@ -1562,12 +1526,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
            struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
+            const size_t input_id = hash_id(input);
+            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
            // add a dependency to the input source so that it is not freed before the copy is done
            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
            input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
            // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1736,7 +1701,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
    // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size);
    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
@@ -1811,7 +1776,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
    ggml_backend_sched_split_graph(sched, graph);
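For context, the scheduler functions touched above are driven through the public ggml-backend API roughly as follows. This is a hedged sketch: the exact ggml_backend_sched_new parameter list shown here is an assumption about the ggml of this period, and error handling is omitted:

    #include "ggml-backend.h"

    // Hedged sketch: drive the scheduler over an already-built graph.
    // Backends are ordered highest priority first, with CPU last.
    static void run_with_sched(ggml_backend_t gpu, ggml_backend_t cpu, struct ggml_cgraph * graph) {
        ggml_backend_t backends[2] = { gpu, cpu };
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, false);

        ggml_backend_sched_reserve(sched, graph);        // measure and allocate backend buffers
        ggml_backend_sched_alloc_graph(sched, graph);    // split the graph (checked by the assert above)
        ggml_backend_sched_graph_compute(sched, graph);  // execute each split on its backend
        ggml_backend_sched_free(sched);
    }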

ggml-cuda.cu

@@ -9286,8 +9286,8 @@ static void ggml_cuda_op_mul_mat(
        used_devices++;
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool src1_on_device = id == g_main_device; // TODO: check from buffer
+        const bool dst_on_device = id == g_main_device;
        ggml_cuda_set_device(id);
        cudaStream_t stream = g_cudaStreams[id][0];
@@ -9338,8 +9338,8 @@ static void ggml_cuda_op_mul_mat(
            continue;
        }
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool src1_on_device = id == g_main_device; // TODO: check from buffer
+        const bool dst_on_device = id == g_main_device;
        const int64_t row_diff = dev[id].row_high - dev[id].row_low;
        ggml_cuda_set_device(id);
@@ -9364,12 +9364,12 @@ static void ggml_cuda_op_mul_mat(
            // the main device memory buffer can be on VRAM scratch, with space for all partial results
            // in that case an offset on dst_ddf_i is needed
-            if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
+            if (id == g_main_device) {
                dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
            }
            // copy src0, src1 to device if necessary
-            if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
+            if (src1_is_contiguous) {
                if (id != g_main_device) {
                    if (convert_src1_to_q8_1) {
                        char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@@ -10351,18 +10351,9 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
    }
}
-GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
    if (!g_cublas_loaded) return false;
-    ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-        return false;
-    }
    if (tensor->op == GGML_OP_MUL_MAT) {
        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
#ifndef NDEBUG
@@ -10372,6 +10363,8 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
        }
    }
+    ggml_cuda_func_t func;
    switch (tensor->op) {
        case GGML_OP_REPEAT:
            func = ggml_cuda_repeat;
@@ -10449,15 +10442,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-                return false;
-            }
            func = ggml_cuda_mul_mat;
            break;
        case GGML_OP_MUL_MAT_ID:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
-                return false;
-            }
            func = ggml_cuda_mul_mat_id;
            break;
        case GGML_OP_SCALE:
@@ -10514,12 +10501,6 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
    }
-    if (params->ith != 0) {
-        return true;
-    }
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return true;
-    }
    func(tensor->src[0], tensor->src[1], tensor);
    return true;
}
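With the ggml_compute_params plumbing gone, ggml_cuda_compute_forward reduces to: pick a launcher by op, call it with (src0, src1, dst), and return false only for unsupported ops so the caller can report an error. A toy, self-contained sketch of that dispatch shape (types and ops are stand-ins, not ggml's):

    #include <stdio.h>

    // Simplified stand-ins for ggml_tensor and ggml_cuda_func_t.
    typedef struct { int op; void * src0; void * src1; } tensor_t;
    typedef void (*cuda_func_t)(void * src0, void * src1, tensor_t * dst);

    enum { OP_ADD, OP_MUL_MAT };

    static void op_add(void * a, void * b, tensor_t * dst)     { (void)a; (void)b; (void)dst; printf("add\n"); }
    static void op_mul_mat(void * a, void * b, tensor_t * dst) { (void)a; (void)b; (void)dst; printf("mul_mat\n"); }

    // Select a launcher by op and run it; return 0 for unsupported ops.
    static int compute_forward(tensor_t * t) {
        cuda_func_t func;
        switch (t->op) {
            case OP_ADD:     func = op_add;     break;
            case OP_MUL_MAT: func = op_mul_mat; break;
            default:         return 0;          // caller prints "op not supported"
        }
        func(t->src0, t->src1, t);
        return 1;
    }

    int main(void) {
        tensor_t t = { OP_MUL_MAT, NULL, NULL };
        return compute_forward(&t) ? 0 : 1;
    }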
@@ -10636,13 +10617,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
        size_t original_size = ggml_nbytes(tensor);
        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-        //printf("%s: data: %p, original: %lu, padded: %lu, diff: %lu\n", tensor->name, tensor->data, original_size, padded_size, padded_size - original_size);
-        //printf("buffer range: %p - %p\n", ctx->dev_ptr, (char *)ctx->dev_ptr + buffer->size);
        if (padded_size > original_size && tensor->view_src == nullptr) {
-            //CUDA_CHECK(cudaDeviceSynchronize());
-            //printf("cudaMemset(%p, %d, %ld)\n", (char *)tensor->data + original_size, 0, padded_size - original_size);
-            //CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
        }
    }
}
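The init_tensor hook now zeroes the padding that the buffer type may allocate beyond ggml_nbytes(tensor); previously the memset was commented out. My reading, stated as an assumption rather than something this commit says, is that kernels which process whole padded rows must not see uninitialized bytes in that tail. A host-side sketch of the same rule with made-up sizes:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    // Host-side sketch: the allocation may be larger than the tensor's payload,
    // and the extra tail is zeroed so nothing downstream reads stale garbage.
    int main(void) {
        const size_t original_size = 1000;   // plays the role of ggml_nbytes(tensor)
        const size_t padded_size   = 1024;   // plays the role of ggml_backend_buft_get_alloc_size(...)

        char * data = malloc(padded_size);
        memset(data, 0xAB, padded_size);     // pretend the buffer holds old garbage
        if (padded_size > original_size) {
            // the real code does this with cudaMemset on device memory
            memset(data + original_size, 0, padded_size - original_size);
        }
        printf("first tail byte after zeroing: %d\n", data[original_size]);
        free(data);
        return 0;
    }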
@@ -11254,9 +11230,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
    ggml_cuda_set_main_device(cuda_ctx->device);
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_TYPE_COMPUTE;
-    params.ith = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];
@@ -11278,7 +11251,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
        }
#endif
-        bool ok = ggml_cuda_compute_forward(&params, node);
+        bool ok = ggml_cuda_compute_forward(node);
        if (!ok) {
            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
        }