diff --git a/ggml-backend.c b/ggml-backend.c
index 2cc9e09a8..80e129cf8 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1078,6 +1078,8 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
+    bool debug;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
@@ -1130,11 +1132,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED
 #define GET_CAUSE(node) ""
 #endif
 
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
-
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
     // TODO: use supports_op to check if the backend supports the op
@@ -1232,7 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }
 
-static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
     if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
         *node_backend_id = cur_backend_id;
         SET_CAUSE(node, "2.2");
@@ -1252,7 +1249,7 @@ static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node
     return cur_backend_id;
 }
 
-static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
     ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
     ggml_backend_buffer_type_t buft = NULL;
 
@@ -1322,9 +1319,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
@@ -1350,7 +1344,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else if (cur_backend_id != -1) {
                 // FIXME: clean this
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
                 if (cur_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
@@ -1375,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     cur_backend_id = *node_backend_id;
                 }
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
                 if (cur_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
@@ -1395,7 +1389,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
@@ -1411,15 +1405,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
 
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
-
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1446,9 +1436,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1499,7 +1486,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                     const size_t id = hash_id(src);
                     int src_backend_id = sched->tensor_backend_id[id];
-                    bool supported = buffer_supported(sched, src, cur_backend_id);
+                    bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                     if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                         //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
@@ -1560,7 +1547,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }
 
-                bool supported = buffer_supported(sched, src, cur_backend_id);
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                 if (src_backend_id != cur_backend_id && !supported) {
                     // create a copy of the input in the split's backend
                     const size_t id = hash_id(src);
@@ -1587,12 +1574,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         split->i_end = graph->n_nodes;
         sched->n_splits = i_split + 1;
     }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
-    // create copies of the graph for each split
-    // TODO: avoid this copy
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
 
     // swap node_backend_ids and leaf_backend_ids and prevs
     {
@@ -1605,6 +1590,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
+    // create copies of the graph for each split
+    // TODO: avoid this copy
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
@@ -1805,6 +1792,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
diff --git a/ggml-blas.h b/ggml-blas.h
index 646ca84ef..f2e37de06 100644
--- a/ggml-blas.h
+++ b/ggml-blas.h
@@ -14,6 +14,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
 GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
 
 // number of threads used for conversion to float
+// for openblas and blis, this will also set the number of threads used for blas operations
 GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 
diff --git a/llama.cpp b/llama.cpp
index 7e76c022b..225ea977f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11533,17 +11533,18 @@ static struct ggml_cgraph * llama_build_graph(
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
-        //if (batch.n_tokens < 32 || full_offload) {
-        //    if (il != -1 && strcmp(name, "norm") == 0) {
-        //        for (auto * backend : lctx.backends) {
-        //            if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-        //                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-        //                break;
-        //            }
-        //        }
-        //    }
-        //}
+        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        if (batch.n_tokens < 32 || full_offload) {
+            if (il != -1 && strcmp(name, "norm") == 0) {
+                for (auto * backend : lctx.backends) {
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                        break;
+                    }
+                }
+            }
+        }
     };
 
     struct ggml_cgraph * result = NULL;
@@ -12261,17 +12262,6 @@ static int llama_decode_internal(
         }
 
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        //       with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        //    n_threads = std::min(4, n_threads);
-        //}
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
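
For reference, a minimal usage sketch (not part of the patch) of the two knobs touched above: the GGML_SCHED_DEBUG environment variable, which makes the scheduler print its backend/split assignments, and ggml_backend_blas_set_n_threads() from ggml-blas.h. The thread count and the use of setenv() are illustrative assumptions; normally the variable would simply be exported in the shell before running.

/* usage sketch, assuming a POSIX environment; thread count is a placeholder */
#include <stdio.h>
#include <stdlib.h>

#include "ggml-backend.h"
#include "ggml-blas.h"

int main(void) {
    // ggml_backend_sched_new() reads GGML_SCHED_DEBUG once at creation time,
    // so the variable must be set before the scheduler is constructed
    setenv("GGML_SCHED_DEBUG", "1", 1);

    ggml_backend_t blas = ggml_backend_blas_init();
    if (blas == NULL) {
        fprintf(stderr, "BLAS backend not available\n");
        return 1;
    }

    if (ggml_backend_is_blas(blas)) {
        // with openblas/blis this also controls the threads used by the BLAS calls,
        // not only the conversion to float (see the ggml-blas.h comment above)
        ggml_backend_blas_set_n_threads(blas, 8);
    }

    // ... build a graph and a ggml_backend_sched over this backend here ...

    ggml_backend_free(blas);
    return 0;
}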