diff --git a/ggml-backend.c b/ggml-backend.c
index 31f8d5a6d..7429a1f44 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1696,7 +1696,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+    GGML_ASSERT(ggml_backend_buft_is_host(ggml_backend_get_default_buffer_type(backends[n_backends - 1]))); // last backend must be host
 
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp
index 95dcb0fd3..6e12e93f5 100644
--- a/ggml-mpi.cpp
+++ b/ggml-mpi.cpp
@@ -548,7 +548,7 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t
     if (!ctx->remote) {
         ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(),
-                                                            (int) ctx->backends.size(), cgraph->n_nodes);
+                                                            (int) ctx->backends.size(), cgraph->n_nodes, false);
         ggml_backend_sched_reserve(sched, cgraph);
         ggml_backend_sched_graph_compute(sched, cgraph);
 
@@ -850,7 +850,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer
     return buffer;
 }
 
-bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
 //    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
 //    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
 //
@@ -870,7 +870,8 @@ bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml
 //    } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){
 //        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
 //    }
-    return true;
+//    fprintf(stderr, "ATTEMPTING ASYNC COPY FOR SRC TENSOR %s TO DST TENSOR %s WITH SRC BACKEND %s AND DST BACKEND %s\n", src->name, dst->name, ggml_backend_name(backend_src), ggml_backend_name(backend_dst));
+    return false;
 }
 
diff --git a/llama.cpp b/llama.cpp
index 96b2adbbe..6ffd905fd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9012,13 +9012,15 @@ static int llama_decode_internal(
     //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
     //}
 
+#ifdef GGML_USE_MPI
+    if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
+#endif
+
     // extract logits
     // TODO: do not compute and extract logits if only embeddings are needed
     //       update the graphs to skip "result_output" if logits are not needed
     if (res) {
-#ifdef GGML_USE_MPI
-        if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
-#endif
+
         ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
         GGML_ASSERT(backend_res != nullptr);
 
@@ -9104,6 +9106,10 @@ static int llama_decode_internal(
                 } break;
             }
         }
+
+#ifdef GGML_USE_MPI
+    }
+#endif
     }
 
     // wait for the computation to finish (automatically done when obtaining the model output)
@@ -9121,9 +9127,7 @@ static int llama_decode_internal(
         }
     }
 
-#ifdef GGML_USE_MPI
-    }
-#endif
+
 
     return 0;
 }
 
@@ -13051,7 +13055,7 @@ struct llama_context * llama_new_context_with_model(
 
     //    ctx->backend_cpu = ctx->backends.back();
 
-    ctx->backends.push_back(ctx->backend_cpu);
+    ctx->backends.push_back(ggml_backend_mpi_init(&ctx->backend_cpu, 1, ggml_mpi_rank(model->ctx_mpi)));
 #endif