also duplicate gpu compute buffers to avoid races
This commit is contained in:
parent
a97198747f
commit
16e12ab734
2 changed files with 64 additions and 26 deletions
24
ggml-cuda.cu
24
ggml-cuda.cu
|
@ -10842,8 +10842,11 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
|
|||
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
|
||||
|
||||
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
||||
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||
|
||||
// host -> device
|
||||
if (ggml_backend_buffer_is_cuda_host(src->buffer) && ggml_backend_buffer_is_cuda(dst->buffer)) {
|
||||
if (ggml_backend_buffer_is_cuda_host(buf_src) && ggml_backend_buffer_is_cuda(buf_dst)) {
|
||||
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
|
||||
// make sure the data is ready on the source backend
|
||||
// the CPU backend does not support async compute, so this does nothing at the moment
|
||||
|
@ -10854,7 +10857,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
|||
}
|
||||
|
||||
// device -> host
|
||||
if (ggml_backend_buffer_is_cuda_host(dst->buffer) && ggml_backend_buffer_is_cuda(src->buffer)) {
|
||||
if (ggml_backend_buffer_is_cuda_host(buf_dst) && ggml_backend_buffer_is_cuda(buf_src)) {
|
||||
// this shoudln't happen currently because the dst backend is our own backend, which does not support host buffers
|
||||
GGML_ASSERT(false);
|
||||
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
|
||||
|
@ -10875,9 +10878,14 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
|||
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
|
||||
|
||||
if (backend_src != backend_dst) {
|
||||
//printf("async copy between devices %s, %d -> %d\n", src->name, cuda_ctx_src->device, cuda_ctx_dst->device);
|
||||
cudaDeviceSynchronize();
|
||||
// TODO: reuse event?
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
|
||||
|
||||
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
|
||||
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
|
||||
cudaEvent_t event;
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
|
||||
|
||||
|
@ -10885,12 +10893,16 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
|||
CUDA_CHECK(cudaEventRecord(event, g_cudaStreams[cuda_ctx_src->device][0]));
|
||||
|
||||
// wait on dst stream
|
||||
ggml_cuda_set_device(cuda_ctx_dst->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], event, 0));
|
||||
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_dst->device][0]));
|
||||
|
||||
CUDA_CHECK(cudaEventDestroy(event));
|
||||
}
|
||||
} else {
|
||||
// copy
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
62
llama.cpp
62
llama.cpp
|
@ -1663,6 +1663,16 @@ struct llama_model {
|
|||
struct llama_context {
|
||||
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
|
||||
~llama_context() {
|
||||
for (auto & it : bufs_compute) {
|
||||
// restore the original buffer in the tallocr
|
||||
ggml_tallocr_t allocr = ggml_backend_sched_get_tallocr(sched, it.first);
|
||||
ggml_tallocr_set_buffer(allocr, it.second[0]);
|
||||
// free the rest of the buffers
|
||||
for (size_t i = 1; i < it.second.size(); ++i) {
|
||||
ggml_backend_buffer_free(it.second[i]);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_sched_free(sched);
|
||||
|
||||
for (ggml_backend_t backend : backends) {
|
||||
|
@ -1670,6 +1680,7 @@ struct llama_context {
|
|||
}
|
||||
|
||||
ggml_backend_buffer_free(buf_logits);
|
||||
|
||||
}
|
||||
|
||||
llama_cparams cparams;
|
||||
|
@ -1719,10 +1730,11 @@ struct llama_context {
|
|||
std::vector<uint8_t> buf_compute_meta;
|
||||
ggml_backend_sched_t sched = nullptr;
|
||||
// allocator for the input tensors
|
||||
ggml_tallocr * alloc_cpu = nullptr;
|
||||
ggml_tallocr_t alloc_cpu = nullptr;
|
||||
|
||||
std::vector<ggml_backend_buffer_t> buf_cpu_ub;
|
||||
size_t buf_cpu_ub_cur = 0;
|
||||
std::map<ggml_backend_t, std::vector<ggml_backend_buffer_t>> bufs_compute;
|
||||
size_t n_compute_bufs = 0;
|
||||
size_t i_compute_buf = 0;
|
||||
|
||||
// temporary buffer for copying data to/from the backend
|
||||
std::vector<no_init<uint8_t>> buf_copy;
|
||||
|
@ -6704,15 +6716,17 @@ static int llama_decode_internal(
|
|||
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
||||
|
||||
// change the CPU compute buffer to avoid overwriting inputs
|
||||
size_t buf_cpu_ub_cur = lctx.buf_cpu_ub_cur;
|
||||
lctx.buf_cpu_ub_cur = (lctx.buf_cpu_ub_cur + 1) % lctx.buf_cpu_ub.size();
|
||||
if (buf_cpu_ub_cur == 0 && cur_token > 0) {
|
||||
size_t i_compute_buf = lctx.i_compute_buf;
|
||||
lctx.i_compute_buf = (lctx.i_compute_buf + 1) % lctx.n_compute_bufs;
|
||||
if (i_compute_buf == 0 && cur_token > 0) {
|
||||
// sync all backends to ensure that the current buffer is not in use
|
||||
printf("not enough buffers, syncing now\n");
|
||||
ggml_backend_sched_synchronize(lctx.sched);
|
||||
}
|
||||
|
||||
ggml_tallocr_set_buffer(lctx.alloc_cpu, lctx.buf_cpu_ub.at(buf_cpu_ub_cur));
|
||||
for (auto it : lctx.bufs_compute) {
|
||||
ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(lctx.sched, it.first);
|
||||
ggml_tallocr_set_buffer(alloc, it.second.at(i_compute_buf));
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(lctx.sched);
|
||||
|
||||
|
@ -6833,7 +6847,7 @@ static int llama_decode_internal(
|
|||
}
|
||||
|
||||
ggml_backend_sched_synchronize(lctx.sched);
|
||||
lctx.buf_cpu_ub_cur = 0;
|
||||
lctx.i_compute_buf = 0;
|
||||
|
||||
// measure the performance only for the single-token evals
|
||||
if (n_tokens_all == 1) {
|
||||
|
@ -10003,14 +10017,26 @@ struct llama_context * llama_new_context_with_model(
|
|||
ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
||||
|
||||
// duplicate cpu buffers for microbatching
|
||||
ggml_backend_buffer_t buf_cpu = ggml_tallocr_get_buffer(ctx->alloc_cpu);
|
||||
size_t buf_size = ggml_backend_buffer_get_size(buf_cpu);
|
||||
ctx->buf_cpu_ub.push_back(buf_cpu);
|
||||
int n_ub = 64;
|
||||
const int n_ub = 16;
|
||||
ctx->n_compute_bufs = n_ub;
|
||||
|
||||
for (ggml_backend_t b : ctx->backends) {
|
||||
ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
|
||||
ggml_backend_buffer_t buf = ggml_tallocr_get_buffer(alloc);
|
||||
size_t buf_size = ggml_backend_buffer_get_size(buf);
|
||||
ctx->bufs_compute[b].push_back(buf);
|
||||
auto * buft = ggml_backend_buffer_get_type(buf);
|
||||
for (int i = 1; i < n_ub; ++i) {
|
||||
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_size);
|
||||
ctx->buf_cpu_ub.push_back(buf);
|
||||
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
|
||||
if (buf == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffer\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->bufs_compute[b].push_back(buf);
|
||||
}
|
||||
}
|
||||
|
||||
// allocate buffer for logits output
|
||||
ctx->buf_logits = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), hparams.n_vocab*cparams.n_ctx*sizeof(float));
|
||||
if (ctx->buf_logits == nullptr) {
|
||||
|
@ -10816,13 +10842,13 @@ int32_t llama_decode(
|
|||
|
||||
float * llama_get_logits(struct llama_context * ctx) {
|
||||
ggml_backend_sched_synchronize(ctx->sched);
|
||||
ctx->buf_cpu_ub_cur = 0;
|
||||
ctx->i_compute_buf = 0;
|
||||
return ctx->logits;
|
||||
}
|
||||
|
||||
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
||||
ggml_backend_sched_synchronize(ctx->sched);
|
||||
ctx->buf_cpu_ub_cur = 0;
|
||||
ctx->i_compute_buf = 0;
|
||||
|
||||
assert(ctx->logits_valid.at(i));
|
||||
return ctx->logits + i*ctx->model.hparams.n_vocab;
|
||||
|
@ -10830,7 +10856,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
|||
|
||||
float * llama_get_embeddings(struct llama_context * ctx) {
|
||||
ggml_backend_sched_synchronize(ctx->sched);
|
||||
ctx->buf_cpu_ub_cur = 0;
|
||||
ctx->i_compute_buf = 0;
|
||||
|
||||
return ctx->embedding.data();
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue