sched : print assignments when GGML_SCHED_DEBUG env variable is set

parent a8a1bf7981
commit ecb75b5f54

3 changed files with 30 additions and 50 deletions
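For context, here is a minimal standalone sketch of the behavior this commit adds. It is not part of the diff: `sched_like` and the printf body are illustrative stand-ins for the real scheduler state and for `ggml_backend_sched_print_assignments`. The scheduler records whether `GGML_SCHED_DEBUG` is present in the environment when it is created, and later prints the graph split/backend assignments only when that flag is set, replacing the old compile-time `DEBUG_PASS*` defines.

```c
// Illustrative stand-in only: shows the env-var gating pattern introduced by
// this commit, not the real ggml scheduler structures.
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct sched_like {
    bool debug; // mirrors the new `bool debug;` field in ggml_backend_sched
};

static void sched_like_new(struct sched_like * sched) {
    // mirrors: sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
}

static void sched_like_split_graph(const struct sched_like * sched) {
    if (sched->debug) {
        // the real code calls ggml_backend_sched_print_assignments(sched, graph) here
        printf("(per-node backend assignments would be printed here)\n");
    }
}

int main(void) {
    struct sched_like sched = { false };
    sched_like_new(&sched);
    sched_like_split_graph(&sched); // prints only if GGML_SCHED_DEBUG is set
    return 0;
}
```

In other words, the assignment dump can now be enabled at run time, for example by exporting `GGML_SCHED_DEBUG=1` before launching a binary built against this code, instead of recompiling with the `DEBUG_PASS*` macros.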
@@ -1078,6 +1078,8 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
+    bool debug;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
@@ -1130,11 +1132,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED
 #define GET_CAUSE(node) ""
 #endif
 
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
-
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
     // TODO: use supports_op to check if the backend supports the op
@@ -1232,7 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }
 
-static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
     if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
         *node_backend_id = cur_backend_id;
         SET_CAUSE(node, "2.2");
@@ -1252,7 +1249,7 @@ static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node
     return cur_backend_id;
 }
 
-static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
     ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
     ggml_backend_buffer_type_t buft = NULL;
 
@@ -1322,9 +1319,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
@@ -1350,7 +1344,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         } else if (cur_backend_id != -1) {
             // FIXME: clean this
-            cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+            cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             if (cur_backend_id == sched->n_backends - 1) {
                 // skip cpu (lowest prio backend)
                 cur_backend_id = -1;
@@ -1375,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = *node_backend_id;
             }
         } else if (cur_backend_id != -1) {
-            cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+            cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             if (cur_backend_id == sched->n_backends - 1) {
                 // skip cpu (lowest prio backend)
                 cur_backend_id = -1;
@@ -1395,7 +1389,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
@@ -1411,15 +1405,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1446,9 +1436,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1499,7 +1486,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                     const size_t id = hash_id(src);
                     int src_backend_id = sched->tensor_backend_id[id];
-                    bool supported = buffer_supported(sched, src, cur_backend_id);
+                    bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                     if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                         //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
@@ -1560,7 +1547,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }
 
-                bool supported = buffer_supported(sched, src, cur_backend_id);
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                 if (src_backend_id != cur_backend_id && !supported) {
                     // create a copy of the input in the split's backend
                     const size_t id = hash_id(src);
@@ -1587,12 +1574,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         split->i_end = graph->n_nodes;
         sched->n_splits = i_split + 1;
     }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
-    // create copies of the graph for each split
-    // TODO: avoid this copy
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
 
     // swap node_backend_ids and leaf_backend_ids and prevs
     {
@@ -1605,6 +1590,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
+    // create copies of the graph for each split
+    // TODO: avoid this copy
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
@@ -1805,6 +1792,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
@@ -14,6 +14,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
 GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
 
 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
 GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 
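As a side note on the header hunk above, here is a hedged usage sketch of the declared API. The include paths are assumptions; only `ggml_backend_blas_init`, `ggml_backend_is_blas`, `ggml_backend_blas_set_n_threads` and `ggml_backend_free` from the general backend API are used.

```c
// Hedged usage sketch for the BLAS backend API declared above; include paths
// are assumed, and error handling is kept minimal.
#include "ggml-backend.h" // assumed: provides ggml_backend_t and ggml_backend_free
#include "ggml-blas.h"    // assumed: the header shown in this hunk
#include <stdio.h>

int main(void) {
    ggml_backend_t backend = ggml_backend_blas_init();
    if (backend == NULL || !ggml_backend_is_blas(backend)) {
        fprintf(stderr, "failed to initialize the BLAS backend\n");
        return 1;
    }
    // per the comment above: sets the threads used for conversion to float, and for
    // openblas/blis also the threads used by the BLAS calls themselves
    ggml_backend_blas_set_n_threads(backend, 4);
    ggml_backend_free(backend);
    return 0;
}
```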
llama.cpp (34 changed lines)
@@ -11533,17 +11533,18 @@ static struct ggml_cgraph * llama_build_graph(
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
-        //if (batch.n_tokens < 32 || full_offload) {
-        //    if (il != -1 && strcmp(name, "norm") == 0) {
-        //        for (auto * backend : lctx.backends) {
-        //            if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-        //                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-        //                break;
-        //            }
-        //        }
-        //    }
-        //}
+        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        if (batch.n_tokens < 32 || full_offload) {
+            if (il != -1 && strcmp(name, "norm") == 0) {
+                for (auto * backend : lctx.backends) {
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                        break;
+                    }
+                }
+            }
+        }
     };
 
     struct ggml_cgraph * result = NULL;
@@ -12261,17 +12262,6 @@ static int llama_decode_internal(
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        //       with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        //    n_threads = std::min(4, n_threads);
-        //}
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
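To make the restored condition in `llama_build_graph` easier to read, here is a compact sketch of the selection rule it applies. The predicate functions below are stubs standing in for `ggml_backend_supports_buft`, `ggml_backend_supports_op` and `ggml_backend_offload_op`, and the types are simplified; this is not the real ggml API.

```c
// Sketch of the backend-selection rule from the restored llama.cpp block:
// pick the first backend that supports the layer's buffer type AND can either
// run the op itself or offload it. Stubs only; not the real ggml API.
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct { const char * name; } backend_stub;

static bool stub_supports_buft(const backend_stub * b) { (void)b; return true;  } // stand-in for ggml_backend_supports_buft
static bool stub_supports_op  (const backend_stub * b) { (void)b; return true;  } // stand-in for ggml_backend_supports_op
static bool stub_offload_op   (const backend_stub * b) { (void)b; return false; } // stand-in for ggml_backend_offload_op

static const backend_stub * pick_backend_for_norm(const backend_stub * backends, size_t n) {
    for (size_t i = 0; i < n; i++) {
        if (stub_supports_buft(&backends[i]) &&
            (stub_supports_op(&backends[i]) || stub_offload_op(&backends[i]))) {
            // the real code then calls ggml_backend_sched_set_tensor_backend and breaks
            return &backends[i];
        }
    }
    return NULL;
}

int main(void) {
    backend_stub backends[] = { { "gpu" }, { "cpu" } };
    const backend_stub * chosen = pick_backend_for_norm(backends, 2);
    printf("chosen backend: %s\n", chosen ? chosen->name : "(none)");
    return 0;
}
```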