Change the requirement that the last backend be CPU to requiring that its default buffer type be a host buffer; fix rebase errors

Branden Butler 2024-03-14 22:24:54 -05:00
parent e8a61568e9
commit 2217b02c99
3 changed files with 16 additions and 11 deletions


@@ -1696,7 +1696,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+    GGML_ASSERT(ggml_backend_buft_is_host(ggml_backend_get_default_buffer_type(backends[n_backends - 1]))); // last backend must be host
 
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
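The scheduler previously insisted that its last (fallback) backend be the CPU backend itself; the relaxed assert only requires that the backend's default buffer type live in host memory, which is what lets an MPI proxy wrapped around the CPU backend take that slot. A minimal sketch of the same check against the public ggml-backend API (the helper name is illustrative, not part of ggml):

    #include "ggml-backend.h"

    // Illustrative helper: a backend qualifies as the scheduler's fallback backend
    // if its default buffer type allocates plain host memory. This holds for the
    // CPU backend and for host-buffer wrappers such as the MPI proxy backend.
    static bool sched_fallback_backend_ok(ggml_backend_t backend) {
        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
        return ggml_backend_buft_is_host(buft);
    }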


@@ -548,7 +548,7 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t
     if (!ctx->remote) {
         ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(),
-                                                            (int) ctx->backends.size(), cgraph->n_nodes);
+                                                            (int) ctx->backends.size(), cgraph->n_nodes, false);
         ggml_backend_sched_reserve(sched, cgraph);
         ggml_backend_sched_graph_compute(sched, cgraph);
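Upstream's pipeline-parallelism work added a `parallel` flag to ggml_backend_sched_new(), which is why the call here gains a trailing `false` (sequential execution). A sketch of the scheduler lifecycle around that call, assuming the signature shown in this hunk; the wrapper function itself is illustrative:

    #include <vector>
    #include "ggml-backend.h"

    // Illustrative wrapper: create a scheduler over the given backends, reserve
    // compute buffers for the graph, run it once, then free the scheduler.
    static enum ggml_status run_graph_once(std::vector<ggml_backend_t> & backends,
                                           std::vector<ggml_backend_buffer_type_t> & bufts,
                                           struct ggml_cgraph * cgraph) {
        ggml_backend_sched_t sched = ggml_backend_sched_new(
            backends.data(), bufts.data(), (int) backends.size(),
            cgraph->n_nodes, /*parallel =*/ false);
        ggml_backend_sched_reserve(sched, cgraph);                      // pre-allocate compute buffers
        enum ggml_status status = ggml_backend_sched_graph_compute(sched, cgraph);
        ggml_backend_sched_free(sched);
        return status;
    }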
@@ -850,7 +850,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer
     return buffer;
 }
 
-bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
 //    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
 //    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
 //
@@ -870,7 +870,8 @@ bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml
 //    } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){
 //        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
 //    }
-    return true;
+    // fprintf(stderr, "ATTEMPTING ASYNC COPY FOR SRC TENSOR %s TO DST TENSOR %s WITH SRC BACKEND %s AND DST BACKEND %s\n", src->name, dst->name, ggml_backend_name(backend_src), ggml_backend_name(backend_dst));
+    return false;
 }
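Returning `false` here tells the generic copy path that this backend cannot (yet) perform the copy asynchronously, so the caller falls back to a plain synchronous copy; the previous `return true` reported success without moving any data. A hedged sketch of that caller-side contract; `try_copy_async` stands in for a backend's cpy_tensor_async hook and is not a real ggml symbol:

    #include "ggml-backend.h"

    typedef bool (*cpy_tensor_async_fn)(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                        const struct ggml_tensor * src, struct ggml_tensor * dst);

    // Illustrative fallback logic: if the async hook is missing or returns false,
    // synchronize the source backend and do a blocking tensor copy instead.
    static void copy_tensor_with_fallback(cpy_tensor_async_fn try_copy_async,
                                          ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                          const struct ggml_tensor * src, struct ggml_tensor * dst) {
        if (try_copy_async == NULL || !try_copy_async(backend_src, backend_dst, src, dst)) {
            ggml_backend_synchronize(backend_src);                       // ensure src data is ready
            ggml_backend_tensor_copy((struct ggml_tensor *) src, dst);   // blocking copy
        }
    }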


@@ -9012,13 +9012,15 @@ static int llama_decode_internal(
         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
         //}
 
+#ifdef GGML_USE_MPI
+        if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
+#endif
         // extract logits
         // TODO: do not compute and extract logits if only embeddings are needed
         //       update the graphs to skip "result_output" if logits are not needed
         if (res) {
-#ifdef GGML_USE_MPI
-            if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
-#endif
             ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
             GGML_ASSERT(backend_res != nullptr);
@@ -9104,6 +9106,10 @@ static int llama_decode_internal(
                     } break;
             }
         }
+#ifdef GGML_USE_MPI
+        }
+#endif
     }
 
     // wait for the computation to finish (automatically done when obtaining the model output)
@@ -9121,9 +9127,7 @@ static int llama_decode_internal(
         }
     }
-#ifdef GGML_USE_MPI
-    }
-#endif
 
     return 0;
 }
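Taken together, the three hunks above re-balance the GGML_USE_MPI guard so that the whole output-extraction block (logits and embeddings) runs only on MPI rank 0, with the closing brace moved up from just before `return 0` to just after the extraction code. A self-contained sketch of the rank-0 pattern in plain MPI, assuming ggml_mpi_rank() ultimately reports the process's rank in the communicator:

    #include <cstdio>
    #include <mpi.h>

    // Illustrative stand-alone example of the head-node-only pattern: every rank
    // computes, but only rank 0 extracts (and would later return) the results.
    int main(int argc, char ** argv) {
        MPI_Init(&argc, &argv);
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        if (rank == 0) {
            std::printf("rank 0: extracting logits/embeddings\n");
        }
        MPI_Finalize();
        return 0;
    }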
@@ -13051,7 +13055,7 @@ struct llama_context * llama_new_context_with_model(
 //        ctx->backend_cpu = ctx->backends.back();
-        ctx->backends.push_back(ctx->backend_cpu);
+        ctx->backends.push_back(ggml_backend_mpi_init(&ctx->backend_cpu, 1, ggml_mpi_rank(model->ctx_mpi)));
 #endif
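Finally, the last backend handed to the scheduler is no longer the raw CPU backend but an MPI proxy wrapped around it, which is exactly the case the relaxed host-buffer assert in the first hunk allows. A sketch of that construction; the signature of ggml_backend_mpi_init() (wrapped backends, count, rank) is inferred from this call site and belongs to the fork's ggml-mpi header, not upstream ggml:

    #include <vector>
    #include "ggml-backend.h"
    #include "ggml-mpi.h"   // fork-specific: assumed to declare ggml_backend_mpi_init()

    // Illustrative helper: wrap the local CPU backend in an MPI proxy for `rank`
    // and append it as the scheduler's last (fallback) backend. The proxy keeps a
    // host default buffer type, so the relaxed assert in ggml_backend_sched_new()
    // still holds.
    static void push_mpi_fallback_backend(std::vector<ggml_backend_t> & backends,
                                          ggml_backend_t backend_cpu, int rank) {
        ggml_backend_t mpi_backend = ggml_backend_mpi_init(&backend_cpu, 1, rank);
        backends.push_back(mpi_backend);
    }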