Change requirement of last backend being CPU to requiring its default buffer type be a host buffer, fix rebase errors
parent e8a61568e9
commit 2217b02c99
3 changed files with 16 additions and 11 deletions
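For context, the relaxed requirement can be expressed directly against the public ggml-backend API. The sketch below is illustrative only: the helper name check_last_backend is made up, while the three ggml_backend_* calls are the same ones used in the first hunk below.

#include <assert.h>
#include "ggml-backend.h"

// Illustrative helper (not part of the commit): the scheduler no longer requires
// that the last backend *is* the CPU backend, only that its default buffer type
// is host-addressable. This also admits wrappers around the CPU backend, such as
// the MPI backend, whose default buffers still live in host memory.
static void check_last_backend(ggml_backend_t * backends, int n_backends) {
    ggml_backend_t last = backends[n_backends - 1];

    // old, stricter requirement:
    // assert(ggml_backend_is_cpu(last));

    // new, relaxed requirement:
    assert(ggml_backend_buft_is_host(ggml_backend_get_default_buffer_type(last)));
}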
@@ -1696,7 +1696,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+    GGML_ASSERT(ggml_backend_buft_is_host(ggml_backend_get_default_buffer_type(backends[n_backends - 1]))); // last backend must be host

     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
@@ -548,7 +548,7 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t

     if (!ctx->remote) {
         ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(),
-                                                            (int) ctx->backends.size(), cgraph->n_nodes);
+                                                            (int) ctx->backends.size(), cgraph->n_nodes, false);

         ggml_backend_sched_reserve(sched, cgraph);
         ggml_backend_sched_graph_compute(sched, cgraph);
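The extra trailing argument reflects the upstream change that added a parallel flag to ggml_backend_sched_new; this backend opts out by passing false. A minimal usage sketch, assuming two already-initialized backends (the variable names backend_gpu, backend_cpu, and graph are illustrative):

// Sketch only: schedule and run a graph across two backends with the updated
// ggml_backend_sched_new signature. Passing NULL for the buffer types selects
// each backend's default; the final `false` disables parallel scheduling.
ggml_backend_t backends[2] = { backend_gpu, backend_cpu };   // illustrative handles

ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, graph->n_nodes, false);

ggml_backend_sched_reserve(sched, graph);        // pre-allocate compute buffers for the graph
ggml_backend_sched_graph_compute(sched, graph);  // split the graph across backends and run it
ggml_backend_sched_free(sched);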
@@ -850,7 +850,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer
     return buffer;
 }

-bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
 //    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
 //    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
 //
@@ -870,7 +870,8 @@ bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml
 //    } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){
 //        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
 //    }
-    return true;
+//    fprintf(stderr, "ATTEMPTING ASYNC COPY FOR SRC TENSOR %s TO DST TENSOR %s WITH SRC BACKEND %s AND DST BACKEND %s\n", src->name, dst->name, ggml_backend_name(backend_src), ggml_backend_name(backend_dst));
+    return false;

 }
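The widened signature matches the upstream backend interface, where asynchronous tensor copies receive both the source and the destination backend; returning false signals that the backend does not handle the copy and the caller should fall back to a blocking copy. A hedged sketch of that calling convention (the wrapper function copy_tensor_maybe_async is hypothetical):

// Hypothetical wrapper illustrating the contract: if the MPI backend declines
// the async copy (returns false), fall back to the generic blocking copy.
static void copy_tensor_maybe_async(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                    struct ggml_tensor * src, struct ggml_tensor * dst) {
    if (!ggml_backend_mpi_cpy_tensor_async(backend_src, backend_dst, src, dst)) {
        ggml_backend_tensor_copy(src, dst);  // synchronous fall-back
    }
}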
18 llama.cpp
@@ -9012,13 +9012,15 @@ static int llama_decode_internal(
    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
    //}

#ifdef GGML_USE_MPI
    if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
#endif

    // extract logits
    // TODO: do not compute and extract logits if only embeddings are needed
    //       update the graphs to skip "result_output" if logits are not needed
    if (res) {
#ifdef GGML_USE_MPI
        if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
#endif

        ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
        GGML_ASSERT(backend_res != nullptr);
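The intent of the guard, in both positions, is that only MPI rank 0 holds the final "result_output" tensor, so only rank 0 reads logits back; other ranks skip the block. A hypothetical helper that centralizes the same rule (not part of the commit; it assumes the fork's ggml-mpi.h and its ggml_mpi_rank function):

#include "ggml-mpi.h"   // fork-specific header providing ggml_mpi_rank()

// Hypothetical helper (not in the commit): expresses the "only rank 0 extracts
// the model output" rule that the #ifdef blocks above write out inline.
static bool should_extract_logits(struct ggml_mpi_context * ctx_mpi) {
    return ggml_mpi_rank(ctx_mpi) == 0;
}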
@@ -9104,6 +9106,10 @@ static int llama_decode_internal(
                } break;
            }
        }
#ifdef GGML_USE_MPI
        }
#endif
    }

    // wait for the computation to finish (automatically done when obtaining the model output)
@@ -9121,9 +9127,7 @@ static int llama_decode_internal(
        }
    }

#ifdef GGML_USE_MPI
    }
#endif

    return 0;
}
@@ -13051,7 +13055,7 @@ struct llama_context * llama_new_context_with_model(

 //        ctx->backend_cpu = ctx->backends.back();
-        ctx->backends.push_back(ctx->backend_cpu);
+        ctx->backends.push_back(ggml_backend_mpi_init(&ctx->backend_cpu, 1, ggml_mpi_rank(model->ctx_mpi)));

 #endif