fixes
parent ecb75b5f54
commit e06659811e
4 changed files with 80 additions and 31 deletions
@@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.output_format = cmd_params_defaults.output_format;
+    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps = cmd_params_defaults.reps;
     params.numa = cmd_params_defaults.numa;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -1232,7 +1232,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
     if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
         *node_backend_id = cur_backend_id;
-        SET_CAUSE(node, "2.2");
+        SET_CAUSE(node, "2.1");
     } else {
         for (int b = 0; b < sched->n_backends; b++) {
             if (b == cur_backend_id) {
@@ -1326,7 +1326,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
 
 
-    // pass 2.2 expand gpu down
+    // expand gpu down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1352,7 +1352,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-    // pass 2.1 expand gpu up
+    // expand gpu up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1377,7 +1377,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-    // pass 2.4 expand rest down
+    // expand rest down
    {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1393,7 +1393,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-    // pass 2.3 expand rest up
+    // expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1410,13 +1410,48 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 
-    // pass 3: assign backends to remaining src from dst and view_src
+    // pass 3
+    // upgrade nodes to higher prio backends with compatible buffer types
+    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+    // however, we also need to verify that the sources are in compatible buffer types
+    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+    // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        int * node_backend_id = &tensor_backend_id(node);
+        for (int b = 0; b < *node_backend_id; b++) {
+            if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+                bool supported = true;
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * src = node->src[j];
+                    if (src == NULL) {
+                        continue;
+                    }
+                    if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+                        supported = false;
+                        break;
+                    }
+                }
+                if (supported) {
+                    *node_backend_id = b;
+                    SET_CAUSE(node, "3.upg");
+                    break;
+                }
+            }
+        }
+    }
+
+    // pass 4: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * cur_backend_id = &tensor_backend_id(node);
         if (node->view_src != NULL && *cur_backend_id == -1) {
             *cur_backend_id = tensor_backend_id(node->view_src);
-            SET_CAUSE(node, "3.vsrc");
+            SET_CAUSE(node, "4.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
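The new pass 3 only moves a node to a higher-priority backend when that backend uses the same buffer type, supports the op, and can read every source tensor. A minimal standalone model of that check (toy types, not the ggml API; the supports_op and buffer_supported tables stand in for ggml_backend_supports_op and ggml_backend_sched_buffer_supported):

#include <vector>

// toy model: backend index doubles as priority (lower index = higher priority)
struct toy_sched {
    std::vector<int> bufts;                           // buffer type used by each backend
    std::vector<std::vector<bool>> supports_op;       // [backend][op]
    std::vector<std::vector<bool>> buffer_supported;  // [src buffer type][backend]
};

struct toy_node {
    int op;
    int backend_id;                // current assignment
    std::vector<int> src_bufts;    // buffer type of each source tensor
};

// returns the backend the node should end up on after the "3.upg" style pass
static int upgrade_backend(const toy_sched & sched, const toy_node & node) {
    for (int b = 0; b < node.backend_id; b++) {        // only higher-priority backends
        if (sched.bufts[b] != sched.bufts[node.backend_id] || !sched.supports_op[b][node.op]) {
            continue;                                  // must keep the same buffer type and support the op
        }
        bool supported = true;
        for (int src_buft : node.src_bufts) {          // every source must be readable from b
            if (!sched.buffer_supported[src_buft][b]) {
                supported = false;
                break;
            }
        }
        if (supported) {
            return b;                                  // upgrade the node
        }
    }
    return node.backend_id;                            // keep the current assignment
}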
@@ -1428,10 +1463,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
                     *src_backend_id = tensor_backend_id(src->view_src);
-                    SET_CAUSE(src, "3.vsrc");
+                    SET_CAUSE(src, "4.vsrc");
                 } else {
                     *src_backend_id = *cur_backend_id;
-                    SET_CAUSE(src, "3.cur");
+                    SET_CAUSE(src, "4.cur");
                 }
             }
         }
@@ -1848,6 +1883,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
     free(sched);
 }
 
@@ -1944,6 +1981,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@@ -56,8 +56,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 
     const enum ggml_type type = src0->type;
 
-    ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
-
     GGML_ASSERT(ne0 == ne01);
     GGML_ASSERT(ne1 == ne11);
     GGML_ASSERT(ne2 == ne12);
@@ -88,32 +86,39 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 
     // convert src0 to float
     if (type != GGML_TYPE_F32) {
+        ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
         ggml_to_float_t const to_float = type_traits.to_float;
 
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
+                float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
 
+                const int min_cols_per_thread = 4096;
+                const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
+                const int n_threads = std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread));
+
 #ifdef GGML_USE_OPENMP
-                #pragma omp parallel for num_threads(ctx->n_threads)
+                #pragma omp parallel for num_threads(n_threads)
                 for (int64_t i01 = 0; i01 < ne01; i01++) {
                     to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
                 }
 #else
-                for (int i = 0; i < ctx->n_threads - 1; i++) {
-                    ctx->tasks.push_back(std::async(std::launch::async, [=]() {
-                        const int64_t start = i*ne01/ctx->n_threads;
-                        const int64_t end = (i + 1)*ne01/ctx->n_threads;
-                        for (int64_t i01 = start; i01 < end; i01++) {
-                            to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
-                        }
-                    }));
+                for (int i = 1; i < n_threads; i++) {
+                    const int64_t start = i*ne01/n_threads;
+                    const int64_t end = (i + 1)*ne01/n_threads;
+                    if (start < end) {
+                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                            for (int64_t i01 = start; i01 < end; i01++) {
+                                to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                            }
+                        }));
+                    }
                 }
                 {
                     // reuse the current thread for the last task
-                    const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads;
-                    const int64_t end = ne01;
+                    const int64_t start = 0;
+                    const int64_t end = ne01/n_threads;
                     for (int64_t i01 = start; i01 < end; i01++) {
                         to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
                     }
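For reference, the non-OpenMP path above keeps the same chunk arithmetic but launches async tasks only for non-empty row ranges and runs the first chunk on the calling thread. A self-contained sketch of that splitting scheme (illustrative names, not the ggml-blas API; the clamp to at least one thread is an addition of this sketch):

#include <algorithm>
#include <cstdint>
#include <functional>
#include <future>
#include <vector>

// split n_rows rows of row_size elements across up to max_threads workers
static void convert_rows(int64_t n_rows, int64_t row_size, int max_threads,
                         const std::function<void(int64_t)> & convert_row) {
    // keep at least ~4096 elements per thread so small tensors stay single-threaded
    const int64_t min_cols_per_thread = 4096;
    const int64_t min_rows_per_thread = std::max<int64_t>(min_cols_per_thread/row_size, 1);
    const int     n_threads = (int) std::max<int64_t>(1,
                                  std::min<int64_t>(max_threads, n_rows/min_rows_per_thread));

    std::vector<std::future<void>> tasks;
    for (int i = 1; i < n_threads; i++) {
        const int64_t start = i*n_rows/n_threads;
        const int64_t end   = (i + 1)*n_rows/n_threads;
        if (start < end) {  // skip empty chunks
            tasks.push_back(std::async(std::launch::async, [start, end, &convert_row]() {
                for (int64_t r = start; r < end; r++) {
                    convert_row(r);
                }
            }));
        }
    }

    // reuse the calling thread for the first chunk
    for (int64_t r = 0; r < n_rows/n_threads; r++) {
        convert_row(r);
    }
    for (auto & t : tasks) {
        t.wait();  // join the workers before the converted data is used
    }
}

The diff stores its futures in ctx->tasks rather than a local vector, but the chunking and the start < end guard are the same.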
@@ -131,7 +136,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 #endif
     }
 
-
 #if defined(OPENBLAS_VERSION)
     openblas_set_num_threads(ctx->n_threads);
 #endif
@@ -150,7 +154,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
             float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             if (type != GGML_TYPE_F32) {
-                x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
+                x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
             }
 
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
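Both sides of the fix above use the same plane offset into the conversion buffer wdata: one float plane per (i02, i03) pair of src0, stored i02-major within each i03 block, with ne_plane elements per plane. The old formula mixed in ne12, a src1 dimension, which only coincided with the correct offset when ne12 == ne02. A small sketch of the shared indexing (helper name is illustrative, not part of ggml-blas):

#include <cstdint>

// offset (in floats) of the converted src0 plane for a given (i02, i03),
// matching the formula used by both the conversion loop and the sgemm loop
static inline int64_t wplane_offset(int64_t i02, int64_t i03, int64_t ne02, int64_t ne_plane) {
    return i02*ne_plane + i03*ne02*ne_plane;
}

// usage: float * wplane = (float *) wdata + wplane_offset(i02, i03, ne02, ne_plane);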
ggml.c
@@ -18749,6 +18749,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
+        case GGML_OP_CONT:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_ACC:
@@ -18833,7 +18834,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
-        case GGML_OP_CONT:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -18993,8 +18993,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
             sched_yield();
         }
 
-        * node_n = atomic_load(&state->shared->node_n);
-        if (* node_n != last_node_n) break;
+        *node_n = atomic_load(&state->shared->node_n);
+        if (*node_n != last_node_n) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19004,15 +19007,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
 static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
-    const int last_task_phase = * task_phase;
+    const int last_task_phase = *task_phase;
 
     while (true) {
         if (do_yield) {
             sched_yield();
         }
 
-        * task_phase = atomic_load(&state->shared->node_task);
-        if (* task_phase != last_task_phase) break;
+        *task_phase = atomic_load(&state->shared->node_task);
+        if (*task_phase != last_task_phase) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
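Both sync helpers above follow the same spin-wait shape: optionally yield, re-read a shared counter, and issue a pause hint while the value has not moved. A standalone sketch of the pattern (names are illustrative, not ggml's; assumes POSIX sched_yield and the same __SSE3__ guard as the diff):

#include <atomic>
#include <sched.h>       // sched_yield (POSIX)
#if defined(__SSE3__)
#include <immintrin.h>   // _mm_pause
#endif

// spin until `shared` moves away from `last_value`, then return the new value
static int spin_wait_for_change(const std::atomic<int> & shared, int last_value, bool do_yield) {
    while (true) {
        if (do_yield) {
            sched_yield();  // give other runnable threads a chance when oversubscribed
        }

        const int cur = shared.load(std::memory_order_acquire);
        if (cur != last_value) {
            return cur;
        }

#if defined(__SSE3__)
        // Tell the processor we're spinning. It's a processor hint for spinlocks.
        _mm_pause();
#endif
    }
}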