sched : print assignments when GGML_SCHED_DEBUG env variable is set

slaren 2024-06-07 20:00:38 +02:00
parent a8a1bf7981
commit ecb75b5f54
3 changed files with 30 additions and 50 deletions
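
For orientation: the commit drops the compile-time DEBUG_PASS1..DEBUG_PASS4 dumps and instead reads a runtime flag once in ggml_backend_sched_new() (sched->debug, set from the GGML_SCHED_DEBUG environment variable) and prints the assignments from ggml_backend_sched_split_graph() when it is set. A minimal standalone sketch of that pattern; the toy_* names are illustrative, not the ggml API:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

// illustrative stand-in for ggml_backend_sched; only the debug flag matters here
struct toy_sched {
    bool debug;
};

// read the flag once at construction time, as ggml_backend_sched_new() does
static void toy_sched_init(struct toy_sched * sched) {
    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; // presence is enough, the value is ignored
}

// print the assignments only when the flag is set, instead of relying on #ifdef blocks
static void toy_sched_split_graph(struct toy_sched * sched) {
    // ... backend assignment passes would run here ...
    if (sched->debug) {
        fprintf(stderr, "node 0: assigned to backend 0\n"); // placeholder for the real dump
    }
}

int main(void) {
    struct toy_sched sched;
    toy_sched_init(&sched);
    toy_sched_split_graph(&sched); // prints only when GGML_SCHED_DEBUG is set in the environment
    return 0;
}
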

ggml-backend.c

@@ -1078,6 +1078,8 @@ struct ggml_backend_sched {
ggml_backend_sched_eval_callback callback_eval;
void * callback_eval_user_data;
bool debug;
// align context_buffer to GGML_MEM_ALIGN
#ifdef _MSC_VER
__declspec(align(GGML_MEM_ALIGN))
@@ -1130,11 +1132,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED
#define GET_CAUSE(node) ""
#endif
//#define DEBUG_PASS1
//#define DEBUG_PASS2
//#define DEBUG_PASS3
//#define DEBUG_PASS4
// returns the backend that should be used for the node based on the current locations
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
// TODO: use supports_op to check if the backend supports the op
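
The causes[] table and GET_CAUSE above (together with the SET_CAUSE calls further down, e.g. SET_CAUSE(node, "2.2")) record a short per-tensor tag saying which pass or branch picked the backend, presumably so the assignment dump can show it; when the feature is compiled out, GET_CAUSE collapses to an empty string. A rough sketch of that bookkeeping, with illustrative names and a fixed-size table instead of the scheduler's hash set:

#include <stdio.h>
#include <string.h>

// compile with -DTOY_SCHED_DEBUG to keep the cause strings, mirroring the #ifdef/#else above
#ifdef TOY_SCHED_DEBUG
static char toy_causes[16][16];                    // illustrative: indexed by a small node id
#define TOY_SET_CAUSE(id, cause) strncpy(toy_causes[id], cause, sizeof(toy_causes[id]) - 1)
#define TOY_GET_CAUSE(id)        toy_causes[id]
#else
#define TOY_SET_CAUSE(id, cause)                   // no-op in release builds
#define TOY_GET_CAUSE(id) ""
#endif

int main(void) {
    TOY_SET_CAUSE(0, "2.2");                       // the real code stores short tags like "2.2"
    printf("node 0 cause: '%s'\n", TOY_GET_CAUSE(0));
    return 0;
}
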
@@ -1232,7 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
}
}
static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.2");
@@ -1252,7 +1249,7 @@ static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node
return cur_backend_id;
}
static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
ggml_backend_buffer_type_t buft = NULL;
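
The two helpers renamed above keep their roles: ggml_backend_sched_set_if_supports() stamps the currently propagating backend onto a node only if that backend supports the op (ggml_backend_supports_op), and ggml_backend_sched_buffer_supported() asks whether the buffer a tensor (or its view source) already lives in is of a type the target backend can use, which is what lets the scheduler skip creating a copy later. A condensed, purely illustrative sketch of those two decisions; the toy_* types stand in for the real ggml structs:

#include <stdbool.h>
#include <stdio.h>

// illustrative stand-ins; integer ids index a priority-ordered backend list
struct toy_node    { int op; int buffer_type; };
struct toy_backend { int id; };

static bool toy_backend_supports_op(const struct toy_backend * b, const struct toy_node * n) {
    (void)b;
    return n->op != 42;              // placeholder rule: pretend op 42 is unsupported
}

static bool toy_backend_supports_buft(const struct toy_backend * b, int buffer_type) {
    return b->id == buffer_type;     // placeholder rule: each backend only accepts its own buffers
}

// "set if supports": offer cur_backend_id to the node, but only record it if the op can run there
static int toy_set_if_supports(const struct toy_backend * backends, int cur_backend_id,
                               const struct toy_node * node, int * node_backend_id) {
    if (toy_backend_supports_op(&backends[cur_backend_id], node)) {
        *node_backend_id = cur_backend_id;   // the real code also calls SET_CAUSE(node, "2.2") here
    }
    return cur_backend_id;
}

// "buffer supported": can the target backend read the buffer the tensor already sits in?
static bool toy_buffer_supported(const struct toy_backend * backends, const struct toy_node * t,
                                 int cur_backend_id) {
    return toy_backend_supports_buft(&backends[cur_backend_id], t->buffer_type);
}

int main(void) {
    struct toy_backend backends[2] = { { 0 }, { 1 } };
    struct toy_node    node        = { /*op*/ 1, /*buffer_type*/ 1 };
    int node_backend_id = -1;
    toy_set_if_supports(backends, 0, &node, &node_backend_id);
    printf("assigned backend: %d, buffer usable by backend 0: %d\n",
           node_backend_id, toy_buffer_supported(backends, &node, 0));
    return 0;
}
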
@@ -1322,9 +1319,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
#ifdef DEBUG_PASS1
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
// pass 2: expand current backend assignments
// assign the same backend to adjacent nodes
@@ -1350,7 +1344,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
} else if (cur_backend_id != -1) {
// FIXME: clean this
cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
if (cur_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
cur_backend_id = -1;
@@ -1375,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
cur_backend_id = *node_backend_id;
}
} else if (cur_backend_id != -1) {
cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
if (cur_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
cur_backend_id = -1;
@@ -1395,7 +1389,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (*node_backend_id != -1) {
cur_backend_id = *node_backend_id;
} else if (cur_backend_id != -1) {
cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
}
}
}
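
Each of these expansion loops follows the same shape: sweep the nodes in one direction, remember the last explicitly assigned backend, offer it to unassigned neighbours through the helper, and stop propagating once the CPU backend (sched->n_backends - 1, the lowest-priority one) is reached so CPU assignments do not spread. A stripped-down sketch of a single forward sweep; the data and the simplifications (no supports_op check, no view ops) are illustrative:

#include <stdio.h>

#define N_NODES    6
#define N_BACKENDS 3   // illustrative: backend 2 plays the role of the CPU (lowest priority)

int main(void) {
    // -1 means "not assigned yet"; a few nodes were pinned by an earlier pass
    int node_backend_id[N_NODES] = { 0, -1, -1, 2, -1, 1 };

    int cur_backend_id = -1;
    for (int i = 0; i < N_NODES; i++) {
        if (node_backend_id[i] != -1) {
            cur_backend_id = node_backend_id[i];
            if (cur_backend_id == N_BACKENDS - 1) {
                cur_backend_id = -1;             // skip cpu (lowest prio backend)
            }
        } else if (cur_backend_id != -1) {
            node_backend_id[i] = cur_backend_id; // the real code only does this if the op is supported
        }
    }

    for (int i = 0; i < N_NODES; i++) {
        printf("node %d -> backend %d\n", i, node_backend_id[i]);
    }
    return 0;
}
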
@@ -1411,15 +1405,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (*node_backend_id != -1) {
cur_backend_id = *node_backend_id;
} else if (cur_backend_id != -1) {
cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
}
}
}
#ifdef DEBUG_PASS2
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
// pass 3: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
@@ -1446,9 +1436,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
#ifdef DEBUG_PASS3
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
// pass 4: split graph, find tensors that need to be copied
{
@@ -1499,7 +1486,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
const size_t id = hash_id(src);
int src_backend_id = sched->tensor_backend_id[id];
bool supported = buffer_supported(sched, src, cur_backend_id);
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true;
@@ -1560,7 +1547,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
bool supported = buffer_supported(sched, src, cur_backend_id);
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && !supported) {
// create a copy of the input in the split's backend
const size_t id = hash_id(src);
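
Pass 4 walks the assigned nodes in order, closes the current split when the backend changes (or when a split would exceed GGML_SCHED_MAX_SPLIT_INPUTS distinct inputs), and flags every input that lives on another backend and is not in a buffer the split's backend can use, so a copy can be created for it. A toy version of that bookkeeping; the arrays below are made-up data, and the real code additionally caches copies in sched->tensor_copies:

#include <stdio.h>

#define N_NODES 5

int main(void) {
    // backend chosen for each node by passes 1-3 (illustrative data)
    int node_backend[N_NODES] = { 0, 0, 1, 1, 0 };
    // backend that currently holds each node's (single) input (illustrative data)
    int src_backend[N_NODES]  = { 0, 0, 0, 1, 1 };

    // split the graph into runs of consecutive nodes that share a backend
    int split_start = 0;
    int cur_backend = node_backend[0];
    for (int i = 1; i <= N_NODES; i++) {
        if (i == N_NODES || node_backend[i] != cur_backend) {
            printf("split [%d, %d) on backend %d\n", split_start, i, cur_backend);
            split_start = i;
            if (i < N_NODES) {
                cur_backend = node_backend[i];
            }
        }
    }

    // inputs that sit on another backend (and, in the real code, are not in a buffer the
    // split's backend supports) get a copy created in the split's backend
    for (int i = 0; i < N_NODES; i++) {
        if (src_backend[i] != node_backend[i]) {
            printf("node %d: copy input from backend %d to backend %d\n",
                   i, src_backend[i], node_backend[i]);
        }
    }
    return 0;
}
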
@@ -1587,12 +1574,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
split->i_end = graph->n_nodes;
sched->n_splits = i_split + 1;
}
#ifdef DEBUG_PASS4
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
// create copies of the graph for each split
// TODO: avoid this copy
if (sched->debug) {
ggml_backend_sched_print_assignments(sched, graph);
}
// swap node_backend_ids and leaf_backend_ids and prevs
{
@@ -1605,6 +1590,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->prev_leaf_backend_ids = tmp;
}
// create copies of the graph for each split
// TODO: avoid this copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i];
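
The pointer swap just above is how the scheduler keeps the previous graph's assignments around without copying: after the swap, node_backend_ids / leaf_backend_ids are free to receive the new assignments while prev_node_backend_ids / prev_leaf_backend_ids still hold the old ones, which a later step can compare against (for example, to detect whether anything changed since the last allocation). The pattern in isolation:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    int n = 4;
    int * node_backend_ids      = calloc(n, sizeof(int));
    int * prev_node_backend_ids = calloc(n, sizeof(int));

    node_backend_ids[2] = 1;    // pretend the last run assigned node 2 to backend 1

    // O(1) pointer swap: what was "current" becomes "previous" for the next run
    int * tmp = node_backend_ids;
    node_backend_ids      = prev_node_backend_ids;
    prev_node_backend_ids = tmp;

    printf("previous assignment of node 2: %d\n", prev_node_backend_ids[2]);

    free(node_backend_ids);
    free(prev_node_backend_ids);
    return 0;
}
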
@@ -1805,6 +1792,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
// initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
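
Because ggml_backend_sched_new() samples the environment exactly once, GGML_SCHED_DEBUG has to be set before the scheduler is created; exporting it in the shell is the usual route, but a test program could also set it programmatically. A small hedged sketch using POSIX setenv (on Windows, _putenv_s would be the rough equivalent):

#define _POSIX_C_SOURCE 200112L  // for setenv
#include <stdlib.h>

int main(void) {
    // make the flag visible before ggml_backend_sched_new() runs;
    // any value works, the scheduler only checks that the variable exists
    setenv("GGML_SCHED_DEBUG", "1", /*overwrite=*/1);

    // ... initialize the backends and create the scheduler here; the assignment
    // table is then printed to stderr whenever the graph is split ...

    return 0;
}
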

ggml-blas.h

@@ -14,6 +14,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
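
The added comment documents that the thread count set here is reused both for the conversion to float and, when linking against OpenBLAS or BLIS, for the BLAS kernels themselves. A hedged usage sketch against the declarations in this header (assumes the BLAS backend is compiled in; error handling kept minimal):

#include "ggml-blas.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_blas_init();
    if (backend != NULL && ggml_backend_is_blas(backend)) {
        // applies to the conversion to float and, for openblas/blis, to the BLAS calls too
        ggml_backend_blas_set_n_threads(backend, 8);
    }
    // ... build a graph and run it on this backend ...
    ggml_backend_free(backend);
    return 0;
}
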

llama.cpp

@@ -11533,17 +11533,18 @@ static struct ggml_cgraph * llama_build_graph(
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
//const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
//if (batch.n_tokens < 32 || full_offload) {
// if (il != -1 && strcmp(name, "norm") == 0) {
// for (auto * backend : lctx.backends) {
// if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
// ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
// break;
// }
// }
// }
//}
const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
if (batch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
for (auto * backend : lctx.backends) {
if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
(ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
break;
}
}
}
}
};
struct ggml_cgraph * result = NULL;
@@ -12261,17 +12262,6 @@ static int llama_decode_internal(
}
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
// with the BLAS calls. need a better solution
// MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
// being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
//if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
// n_threads = std::min(4, n_threads);
//}
ggml_backend_sched_alloc_graph(lctx.sched, gf);
llama_set_inputs(lctx, u_batch);