also duplicate gpu compute buffers to avoid races

slaren 2024-01-20 18:52:33 +01:00
parent a97198747f
commit 16e12ab734
2 changed files with 64 additions and 26 deletions

ggml-cuda.cu

@@ -10842,8 +10842,11 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
 GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
     GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
+    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
     // host -> device
-    if (ggml_backend_buffer_is_cuda_host(src->buffer) && ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (ggml_backend_buffer_is_cuda_host(buf_src) && ggml_backend_buffer_is_cuda(buf_dst)) {
         ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

         // make sure the data is ready on the source backend
         // the CPU backend does not support async compute, so this does nothing at the moment
@@ -10854,7 +10857,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     }

     // device -> host
-    if (ggml_backend_buffer_is_cuda_host(dst->buffer) && ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (ggml_backend_buffer_is_cuda_host(buf_dst) && ggml_backend_buffer_is_cuda(buf_src)) {
         // this shouldn't happen currently because the dst backend is our own backend, which does not support host buffers
         GGML_ASSERT(false);
         ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
@@ -10875,9 +10878,14 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

     if (backend_src != backend_dst) {
         //printf("async copy between devices %s, %d -> %d\n", src->name, cuda_ctx_src->device, cuda_ctx_dst->device);
-        cudaDeviceSynchronize();
+        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
+        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+
         // TODO: reuse event?
         ggml_cuda_set_device(cuda_ctx_src->device);
         cudaEvent_t event;
         CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
@@ -10885,12 +10893,16 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         CUDA_CHECK(cudaEventRecord(event, g_cudaStreams[cuda_ctx_src->device][0]));

         // wait on dst stream
         ggml_cuda_set_device(cuda_ctx_dst->device);
         CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], event, 0));
+
+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_dst->device][0]));
+
         CUDA_CHECK(cudaEventDestroy(event));
-    }
-
-    // copy
-    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    } else {
+        // copy
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    }

     return true;
 }
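
For reference, the cross-device branch above is the standard CUDA inter-stream handshake: record an event on the source device's stream, make the destination stream wait on that event, then issue the copy with cudaMemcpyPeerAsync so it is ordered behind the producer's work. A minimal standalone sketch of the same pattern, assuming two visible devices (the stream and variable names are illustrative, not ggml's):

    // peer_copy.cu — illustrative sketch, not part of the commit
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(call) do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            std::exit(1); \
        } \
    } while (0)

    int main() {
        const size_t nbytes = (1 << 20) * sizeof(float);
        cudaStream_t stream0, stream1;   // stand-ins for g_cudaStreams[device][0]
        float *d0, *d1;

        CHECK(cudaSetDevice(0));
        CHECK(cudaStreamCreate(&stream0));
        CHECK(cudaMalloc(&d0, nbytes));

        CHECK(cudaSetDevice(1));
        CHECK(cudaStreamCreate(&stream1));
        CHECK(cudaMalloc(&d1, nbytes));

        // source side: mark the point after which the data is valid
        CHECK(cudaSetDevice(0));
        cudaEvent_t event;
        CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
        CHECK(cudaEventRecord(event, stream0));

        // destination side: wait for the event, then copy on the dst stream
        CHECK(cudaSetDevice(1));
        CHECK(cudaStreamWaitEvent(stream1, event, 0));
        CHECK(cudaMemcpyPeerAsync(d1, 1, d0, 0, nbytes, stream1));

        CHECK(cudaStreamSynchronize(stream1));
        CHECK(cudaEventDestroy(event));
        return 0;
    }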

llama.cpp

@@ -1663,6 +1663,16 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
+        for (auto & it : bufs_compute) {
+            // restore the original buffer in the tallocr
+            ggml_tallocr_t allocr = ggml_backend_sched_get_tallocr(sched, it.first);
+            ggml_tallocr_set_buffer(allocr, it.second[0]);
+
+            // free the rest of the buffers
+            for (size_t i = 1; i < it.second.size(); ++i) {
+                ggml_backend_buffer_free(it.second[i]);
+            }
+        }
         ggml_backend_sched_free(sched);
         for (ggml_backend_t backend : backends) {
@@ -1670,6 +1680,7 @@ struct llama_context {
         }
+        ggml_backend_buffer_free(buf_logits);
     }

     llama_cparams cparams;
@@ -1719,10 +1730,11 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;

     // allocator for the input tensors
-    ggml_tallocr * alloc_cpu = nullptr;
+    ggml_tallocr_t alloc_cpu = nullptr;
-    std::vector<ggml_backend_buffer_t> buf_cpu_ub;
-    size_t buf_cpu_ub_cur = 0;
+    std::map<ggml_backend_t, std::vector<ggml_backend_buffer_t>> bufs_compute;
+    size_t n_compute_bufs = 0;
+    size_t i_compute_buf = 0;

     // temporary buffer for copying data to/from the backend
     std::vector<no_init<uint8_t>> buf_copy;
@@ -6704,15 +6716,17 @@ static int llama_decode_internal(
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

         // change the CPU compute buffer to avoid overwriting inputs
-        size_t buf_cpu_ub_cur = lctx.buf_cpu_ub_cur;
-        lctx.buf_cpu_ub_cur = (lctx.buf_cpu_ub_cur + 1) % lctx.buf_cpu_ub.size();
-        if (buf_cpu_ub_cur == 0 && cur_token > 0) {
+        size_t i_compute_buf = lctx.i_compute_buf;
+        lctx.i_compute_buf = (lctx.i_compute_buf + 1) % lctx.n_compute_bufs;
+        if (i_compute_buf == 0 && cur_token > 0) {
             // sync all backends to ensure that the current buffer is not in use
             printf("not enough buffers, syncing now\n");
             ggml_backend_sched_synchronize(lctx.sched);
         }
-        ggml_tallocr_set_buffer(lctx.alloc_cpu, lctx.buf_cpu_ub.at(buf_cpu_ub_cur));
+        for (auto it : lctx.bufs_compute) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(lctx.sched, it.first);
+            ggml_tallocr_set_buffer(alloc, it.second.at(i_compute_buf));
+        }

         ggml_backend_sched_reset(lctx.sched);
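
The rotation driving this hunk is round-robin double-buffering generalized to n buffers: each micro-batch writes its inputs into the next duplicate, and only when the index wraps (more in-flight micro-batches than duplicates) does the code fall back to a full synchronization. A self-contained C++ analogue of the scheme, with hypothetical Buffer and sync_all_backends() stand-ins for the ggml types:

    // buffer_ring.cpp — illustrative analogue, not ggml API
    #include <cstdio>
    #include <vector>

    struct Buffer { std::vector<unsigned char> data; };

    static void sync_all_backends() { /* placeholder: drain in-flight graphs */ }

    struct BufferRing {
        std::vector<Buffer> bufs;
        size_t cur = 0;

        // hand out buffers round-robin; `step` counts micro-batches so far
        Buffer & next(size_t step) {
            size_t i = cur;
            cur = (cur + 1) % bufs.size();
            if (i == 0 && step > 0) {
                // wrapped around: the oldest buffer may still be read by a
                // pending graph, so block until everything has drained
                std::printf("not enough buffers, syncing now\n");
                sync_all_backends();
            }
            return bufs[i];
        }
    };

    int main() {
        BufferRing ring;
        ring.bufs.assign(4, Buffer{std::vector<unsigned char>(1024)});

        for (size_t step = 0; step < 10; ++step) {
            Buffer & b = ring.next(step);                 // inputs for this micro-batch land here
            b.data[0] = static_cast<unsigned char>(step); // pretend to write inputs
        }
        return 0;
    }

Note the ownership rule the destructor hunk earlier encodes for this scheme: entry 0 of each vector is the scheduler's original buffer, restored into the tallocr before ggml_backend_sched_free so the scheduler tears it down, while only the duplicates (entries 1 to n_ub-1) are freed explicitly.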
@@ -6833,7 +6847,7 @@ static int llama_decode_internal(
     }

     ggml_backend_sched_synchronize(lctx.sched);
-    lctx.buf_cpu_ub_cur = 0;
+    lctx.i_compute_buf = 0;

     // measure the performance only for the single-token evals
     if (n_tokens_all == 1) {
@@ -10003,14 +10017,26 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

         // duplicate cpu buffers for microbatching
-        ggml_backend_buffer_t buf_cpu = ggml_tallocr_get_buffer(ctx->alloc_cpu);
-        size_t buf_size = ggml_backend_buffer_get_size(buf_cpu);
-        ctx->buf_cpu_ub.push_back(buf_cpu);
-        int n_ub = 64;
-        for (int i = 1; i < n_ub; ++i) {
-            ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_size);
-            ctx->buf_cpu_ub.push_back(buf);
+        const int n_ub = 16;
+        ctx->n_compute_bufs = n_ub;
+        for (ggml_backend_t b : ctx->backends) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
+            ggml_backend_buffer_t buf = ggml_tallocr_get_buffer(alloc);
+            size_t buf_size = ggml_backend_buffer_get_size(buf);
+            ctx->bufs_compute[b].push_back(buf);
+
+            auto * buft = ggml_backend_buffer_get_type(buf);
+            for (int i = 1; i < n_ub; ++i) {
+                ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
+                if (buf == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to allocate compute buffer\n", __func__);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->bufs_compute[b].push_back(buf);
+            }
         }

         // allocate buffer for logits output
         ctx->buf_logits = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), hparams.n_vocab*cparams.n_ctx*sizeof(float));
         if (ctx->buf_logits == nullptr) {
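
One consequence of this loop worth keeping in mind: compute-buffer memory now scales linearly with n_ub on every backend, which is presumably why the count drops from 64 (previously CPU-only) to 16 copies per backend. A back-of-the-envelope check with an illustrative buffer size:

    // illustrative arithmetic only; the 300 MiB figure is hypothetical
    #include <cstdio>

    int main() {
        const double buf_mib = 300.0; // assumed compute-buffer size on one backend
        const int    n_ub    = 16;    // as set in the hunk above

        // every duplicate is a full-size allocation from the same buffer type
        std::printf("per-backend compute memory: %.0f MiB\n", buf_mib * n_ub); // 4800 MiB
        return 0;
    }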
@@ -10816,13 +10842,13 @@ int32_t llama_decode(
 float * llama_get_logits(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;

     return ctx->logits;
 }

 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;

     assert(ctx->logits_valid.at(i));
     return ctx->logits + i*ctx->model.hparams.n_vocab;
@@ -10830,7 +10856,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
 float * llama_get_embeddings(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;

     return ctx->embedding.data();
 }
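
Since every results getter now synchronizes the scheduler and rewinds i_compute_buf, callers keep their usual blocking read semantics with no API change. A minimal caller-side sketch, assuming an already-initialized llama_context (model loading omitted) and the batch API as it existed at the time; the token id is hypothetical:

    // decode_sketch.cpp — illustrative usage, not part of the commit
    #include <cstdio>
    #include "llama.h"

    static void decode_and_read(llama_context * ctx) {
        llama_token tokens[1] = { 1 }; // hypothetical prompt token
        llama_batch batch = llama_batch_get_one(tokens, 1, 0, 0);

        if (llama_decode(ctx, batch) != 0) {
            std::fprintf(stderr, "decode failed\n");
            return;
        }

        // synchronizes all backends and resets i_compute_buf before reading
        float * logits = llama_get_logits(ctx);
        (void)logits;
    }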