code cleanup

parent c2dba0450f
commit 3a774427ae

2 changed files with 93 additions and 155 deletions

ggml-backend.c (199 changed lines)

@@ -1122,27 +1122,26 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op
 
     // assign pre-allocated nodes to their backend
-    // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
-    if (cur_backend != -1) {
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
-        return cur_backend;
+        return cur_backend_id;
     }
 
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
-        if (cur_backend != -1) {
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
-            return cur_backend;
+            return cur_backend_id;
         }
     }
 
-    // input
+    // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
-        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
         SET_CAUSE(tensor, "1.inp");
-        return cur_backend;
+        return cur_backend_id;
     }
 
     // assign nodes that use weights to the backend of the weights
@@ -1153,10 +1152,10 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
-            // check if a backend with higher prio wants to run the op
-            if (src_backend == sched->n_backends - 1) {
-                for (int b = 0; b < src_backend; b++) {
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            // check if a backend with higher prio wants to offload the op
+            if (src_backend_id == sched->n_backends - 1) {
+                for (int b = 0; b < src_backend_id; b++) {
                     if (ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
                         return b;
@@ -1164,7 +1163,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
                 }
            }
            SET_CAUSE(tensor, "1.wgt%d", i);
-            return src_backend;
+            return src_backend_id;
        }
    }
 
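The hunks above can be read as a priority scan: backends are ordered from highest priority to lowest, the CPU backend comes last, and a weight that lives on the last backend is only moved when a higher-priority backend asks to offload the op. Below is a minimal, self-contained sketch of that scan only; it is not the ggml code, and demo_offload_op(), pick_backend_for_weight() and the size threshold are invented stand-ins for ggml_backend_offload_op() and the real policy.

#include <stdbool.h>
#include <stdio.h>

#define N_BACKENDS 3 // e.g. { GPU0, GPU1, CPU }, highest priority first

// hypothetical policy: backend 0 wants to offload ops above a size threshold
static bool demo_offload_op(int backend_id, int op_size) {
    return backend_id == 0 && op_size > 1024;
}

// pick the backend for an op whose weight lives on weight_backend_id
static int pick_backend_for_weight(int weight_backend_id, int op_size) {
    if (weight_backend_id == N_BACKENDS - 1) {        // weight is on the last backend (CPU)
        for (int b = 0; b < weight_backend_id; b++) { // scan higher-priority backends in order
            if (demo_offload_op(b, op_size)) {
                return b;                             // first backend that wants the op wins
            }
        }
    }
    return weight_backend_id;                         // otherwise stay with the weight's backend
}

int main(void) {
    printf("small op -> backend %d\n", pick_backend_for_weight(N_BACKENDS - 1, 64));
    printf("large op -> backend %d\n", pick_backend_for_weight(N_BACKENDS - 1, 4096));
    return 0;
}
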
@@ -1244,28 +1243,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-        if (tensor_backend_id(leaf) != -1) {
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        if (*leaf_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        if (tensor_backend_id(node) != -1) {
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
+        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            if (tensor_backend_id(src) == -1) {
-                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1287,21 +1289,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
     }
 
     // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
@@ -1310,22 +1311,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
     }
 
 
     // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
@@ -1334,16 +1333,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
     }
     // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1351,11 +1350,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
@@ -1368,9 +1367,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        int cur_backend_id = tensor_backend_id(node);
-        if (node->view_src != NULL && cur_backend_id == -1) {
-            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
+        int * cur_backend_id = &tensor_backend_id(node);
+        if (node->view_src != NULL && *cur_backend_id == -1) {
+            *cur_backend_id = tensor_backend_id(node->view_src);
             SET_CAUSE(node, "3.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1378,14 +1377,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            int src_backend_id = tensor_backend_id(src);
-            if (src_backend_id == -1) {
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
-                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
+                    *src_backend_id = tensor_backend_id(src->view_src);
                     SET_CAUSE(src, "3.vsrc");
                 } else {
-                    tensor_backend_id(src) = cur_backend_id;
+                    *src_backend_id = *cur_backend_id;
                     SET_CAUSE(src, "3.cur");
                 }
             }
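Throughout passes 1-3 the same cleanup recurs: instead of expanding the tensor_backend_id(...) lookup macro once for the test and again for the assignment, the address of the entry is taken once and reused. Below is a minimal sketch of that idiom only, with invented data and a stand-in macro rather than the real scheduler state.

#include <stdio.h>

#define MAX_NODES 4

static int backend_ids[MAX_NODES];            // stand-in for sched->tensor_backend_id[]
#define tensor_backend_id(i) backend_ids[i]   // stand-in for the macro of the same name in ggml-backend.c

static int pick_backend(int node) {           // hypothetical assignment policy
    return node % 2;
}

int main(void) {
    for (int i = 0; i < MAX_NODES; i++) {
        backend_ids[i] = (i == 2) ? 1 : -1;   // -1 means "not assigned yet"
    }
    for (int i = 0; i < MAX_NODES; i++) {
        int * node_backend_id = &tensor_backend_id(i); // evaluate the lookup once
        if (*node_backend_id != -1) {
            continue;                         // do not overwrite existing assignments
        }
        *node_backend_id = pick_backend(i);   // write through the cached pointer
    }
    for (int i = 0; i < MAX_NODES; i++) {
        printf("node %d -> backend %d\n", i, backend_ids[i]);
    }
    return 0;
}
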
@@ -1397,19 +1396,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
     // pass 4: split graph, find tensors that need to be copied
     {
-        int cur_split = 0;
+        int i_split = 0;
+        struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
-                sched->splits[0].backend_id = tensor_backend_id(node);
+                split->backend_id = tensor_backend_id(node);
                 break;
             }
         }
-        sched->splits[0].i_start = 0;
-        sched->splits[0].n_inputs = 0;
-        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-        int cur_backend_id = sched->splits[0].backend_id;
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
 
@@ -1417,13 +1417,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 continue;
             }
 
-            int tensor_backend_id = tensor_backend_id(node);
+            const int node_backend_id = tensor_backend_id(node);
 
-            GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
+            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
 
             // check if a weight is on a different backend and start a new split if so
+            // by starting a new split, the memory of the previously offloaded weights can be reused
             bool offload = false;
-            if (tensor_backend_id == cur_backend_id && sched->splits[cur_split].n_inputs > 0) {
+            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
                 for (int j = 0; j < GGML_MAX_SRC; j++) {
                     struct ggml_tensor * src = node->src[j];
                     if (src == NULL) {
@@ -1439,14 +1440,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
 
-            if (tensor_backend_id != cur_backend_id || offload) {
-                sched->splits[cur_split].i_end = i;
-                cur_split++;
-                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
-                sched->splits[cur_split].backend_id = tensor_backend_id;
-                sched->splits[cur_split].i_start = i;
-                sched->splits[cur_split].n_inputs = 0;
-                cur_backend_id = tensor_backend_id;
+            if (node_backend_id != cur_backend_id || offload) {
+                split->i_end = i;
+                i_split++;
+                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+                split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
+                cur_backend_id = node_backend_id;
             }
 
             // find inputs that are not on the same backend
@@ -1456,10 +1458,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     continue;
                 }
 
-                int src_backend_id = tensor_backend_id(src);
+                const int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now
 
-                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                     size_t id = hash_id(src);
                     if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1476,7 +1478,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                         }
                         sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = src_backend_id;
                         SET_CAUSE(tensor_copy, "4.cpy");
                     }
                     int n_graph_inputs = sched->n_graph_inputs++;
@@ -1485,9 +1486,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }
 
-                if (src_backend_id != tensor_backend_id) {
+                if (src_backend_id != node_backend_id) {
                     // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
+                    const size_t id = hash_id(src);
                     if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
@@ -1498,60 +1499,23 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                         }
                         sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
                         SET_CAUSE(tensor_copy, "4.cpy");
                     }
-                    int n_inputs = sched->splits[cur_split].n_inputs++;
+                    int n_inputs = split->n_inputs++;
                     GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
-                    sched->splits[cur_split].inputs[n_inputs] = src;
+                    split->inputs[n_inputs] = src;
                 }
                 node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
             }
         }
     }
-    sched->splits[cur_split].i_end = graph->n_nodes;
-    sched->n_splits = cur_split + 1;
+    split->i_end = graph->n_nodes;
+    sched->n_splits = i_split + 1;
     }
 #ifdef DEBUG_PASS4
     fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        if (tensor_backend == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
-                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            if (src_backend != tensor_backend /* && src_backend != NULL */) {
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
-            }
-            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
-                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
-
     // create copies of the graph for each split
     // TODO: avoid this copy
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
@@ -1562,12 +1526,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
+            const size_t input_id = hash_id(input);
+            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
 
             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
             input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
             // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1736,7 +1701,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
@@ -1811,7 +1776,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
 
     ggml_backend_sched_split_graph(sched, graph);
 
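Pass 4 gets the same treatment at the level of splits: a split pointer is advanced whenever the backend changes, instead of re-indexing sched->splits[cur_split] on every access. The toy program below mirrors only that shape; the names, the fixed backend assignments and MAX_SPLITS are invented for the example and do not come from ggml.

#include <stdio.h>

#define MAX_SPLITS 8

struct demo_split {
    int backend_id;
    int i_start;
    int i_end;
};

int main(void) {
    // per-node backend ids as the earlier passes might have produced them (example data)
    const int node_backend_ids[] = { 0, 0, 1, 1, 1, 0 };
    const int n_nodes = (int) (sizeof(node_backend_ids) / sizeof(node_backend_ids[0]));

    struct demo_split splits[MAX_SPLITS];
    int i_split = 0;
    struct demo_split * split = &splits[0];   // cursor into the split array
    split->backend_id = node_backend_ids[0];
    split->i_start = 0;

    int cur_backend_id = split->backend_id;
    for (int i = 0; i < n_nodes; i++) {
        if (node_backend_ids[i] != cur_backend_id) {
            split->i_end = i;                 // close the current split
            i_split++;
            split = &splits[i_split];         // move the cursor instead of re-indexing everywhere
            split->backend_id = node_backend_ids[i];
            split->i_start = i;
            cur_backend_id = node_backend_ids[i];
        }
    }
    split->i_end = n_nodes;

    for (int s = 0; s <= i_split; s++) {
        printf("split %d: backend %d, nodes [%d, %d)\n",
               s, splits[s].backend_id, splits[s].i_start, splits[s].i_end);
    }
    return 0;
}
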
ggml-cuda.cu (49 changed lines)

@@ -9286,8 +9286,8 @@ static void ggml_cuda_op_mul_mat(
 
         used_devices++;
 
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool src1_on_device = id == g_main_device; // TODO: check from buffer
+        const bool dst_on_device = id == g_main_device;
 
         ggml_cuda_set_device(id);
         cudaStream_t stream = g_cudaStreams[id][0];
@@ -9338,8 +9338,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool src1_on_device = id == g_main_device; // TODO: check from buffer
+        const bool dst_on_device = id == g_main_device;
         const int64_t row_diff = dev[id].row_high - dev[id].row_low;
 
         ggml_cuda_set_device(id);
@@ -9364,12 +9364,12 @@ static void ggml_cuda_op_mul_mat(
 
            // the main device memory buffer can be on VRAM scratch, with space for all partial results
            // in that case an offset on dst_ddf_i is needed
-            if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
+            if (id == g_main_device) {
                dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
            }
 
            // copy src0, src1 to device if necessary
-            if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
+            if (src1_is_contiguous) {
                if (id != g_main_device) {
                    if (convert_src1_to_q8_1) {
                        char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@@ -10351,18 +10351,9 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
     }
 }
 
-GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
     if (!g_cublas_loaded) return false;
 
-    ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-        return false;
-    }
-
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
@@ -10372,6 +10363,8 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         }
     }
 
+    ggml_cuda_func_t func;
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;
@@ -10449,15 +10442,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-                return false;
-            }
             func = ggml_cuda_mul_mat;
             break;
         case GGML_OP_MUL_MAT_ID:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
-                return false;
-            }
             func = ggml_cuda_mul_mat_id;
             break;
         case GGML_OP_SCALE:
@@ -10514,12 +10501,6 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
     }
 
-    if (params->ith != 0) {
-        return true;
-    }
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return true;
-    }
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
@@ -10636,13 +10617,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
         size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
-        //printf("%s: data: %p, original: %lu, padded: %lu, diff: %lu\n", tensor->name, tensor->data, original_size, padded_size, padded_size - original_size);
-        //printf("buffer range: %p - %p\n", ctx->dev_ptr, (char *)ctx->dev_ptr + buffer->size);
-
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            //CUDA_CHECK(cudaDeviceSynchronize());
-            //printf("cudaMemset(%p, %d, %ld)\n", (char *)tensor->data + original_size, 0, padded_size - original_size);
-            //CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
     }
 }
@@ -11254,9 +11230,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
     ggml_cuda_set_main_device(cuda_ctx->device);
 
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_TYPE_COMPUTE;
-    params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -11278,7 +11251,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
 #endif
 
-        bool ok = ggml_cuda_compute_forward(&params, node);
+        bool ok = ggml_cuda_compute_forward(node);
         if (!ok) {
             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
        }
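On the CUDA side, dropping ggml_compute_params and the per-op backend checks leaves ggml_cuda_compute_forward with a single job: map the op to a kernel-launching function and run it. Below is a minimal sketch of that dispatch shape only, with invented demo_* names and trivial "kernels" standing in for the real CUDA functions.

#include <stdbool.h>
#include <stdio.h>

enum demo_op { DEMO_OP_ADD, DEMO_OP_MUL, DEMO_OP_COUNT };

struct demo_tensor {
    enum demo_op op;
    float src0, src1, dst;
};

typedef void (*demo_func_t)(struct demo_tensor *);

static void demo_add(struct demo_tensor * t) { t->dst = t->src0 + t->src1; }
static void demo_mul(struct demo_tensor * t) { t->dst = t->src0 * t->src1; }

// same shape as the cleaned-up dispatcher: pick a function, run it, report support
static bool demo_compute_forward(struct demo_tensor * t) {
    demo_func_t func;
    switch (t->op) {
        case DEMO_OP_ADD: func = demo_add; break;
        case DEMO_OP_MUL: func = demo_mul; break;
        default:          return false; // op not supported
    }
    func(t);
    return true;
}

int main(void) {
    struct demo_tensor t = { DEMO_OP_MUL, 3.0f, 4.0f, 0.0f };
    if (demo_compute_forward(&t)) {
        printf("result: %.1f\n", t.dst);
    }
    return 0;
}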