diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index 1db32901f..e1c8c51e1 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -80,7 +80,7 @@ extern "C" {
         // (optional) asynchronous tensor data access
         void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
         // (optional) complete all pending operations
         void (*synchronize)(ggml_backend_t backend);
diff --git a/ggml-backend.c b/ggml-backend.c
index 505dbba47..c3f9a0f6f 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -279,24 +279,24 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }
 
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
     if (src == dst) {
         return;
     }
 
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }
 
     size_t nbytes = ggml_nbytes(src);
     if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
+        // wait for src to be ready before copy
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, nbytes);
     }
     else {
         ggml_backend_tensor_copy(src, dst);
@@ -1304,6 +1304,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         // copy the input tensors to the split backend
         uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = get_allocr_backend(sched, node_allocr(split->inputs[j]));
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
 
@@ -1312,7 +1313,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
             // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
+            ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
         }
         //ggml_backend_synchronize(split_backend); // necessary to measure copy time
         int64_t copy_end_us = ggml_time_us();
diff --git a/ggml-backend.h b/ggml-backend.h
index 4eb244af1..1c56e5785 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -72,7 +72,7 @@ extern "C" {
 
     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t src_backend, ggml_backend_t dst_backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
 
     //
     // CPU backend
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index bd3814c72..ca9cdec4d 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -10763,11 +10763,32 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
+        return false;
+    }
 
-    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
+    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
+        return false;
+    }
+
+    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
+    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+
+    if (backend_src == backend_dst) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    } else {
+        cudaEvent_t event;
+        CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+        // record event on src stream
+        CUDA_CHECK(cudaEventRecord(event, g_cudaStreams[cuda_ctx_src->device][0]));
+        // wait on dst stream
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], event, 0));
+        // copy
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+
+        CUDA_CHECK(cudaEventDestroy(event));
         return true;
     }
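Note on the cross-device path above (not part of the patch): when backend_src and backend_dst are different CUDA backends, the copy is ordered with a CUDA event rather than a blocking synchronize. The event is recorded on the source backend's stream after the work that produces src, the destination backend's stream waits on that event, and only then is the device-to-device cudaMemcpyAsync issued on the destination stream. Below is a minimal standalone sketch of that record/wait/copy pattern; it uses two streams on a single device in place of the per-backend streams, and the buffer names and sizes are illustrative, not taken from ggml.

// sketch of the record/wait/copy ordering used by the patch (single device, two streams)
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK(call)                                                          \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                      \
                    cudaGetErrorString(err_), __FILE__, __LINE__);           \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

int main(void) {
    const size_t n = 1 << 20;

    float *src, *dst;
    CHECK(cudaMalloc(&src, n * sizeof(float)));
    CHECK(cudaMalloc(&dst, n * sizeof(float)));

    cudaStream_t stream_src, stream_dst;   // stand-ins for the src/dst backend streams
    CHECK(cudaStreamCreate(&stream_src));
    CHECK(cudaStreamCreate(&stream_dst));

    // work that produces `src` is queued on the source stream
    CHECK(cudaMemsetAsync(src, 0, n * sizeof(float), stream_src));

    // record an event on the source stream after the producing work
    cudaEvent_t event;
    CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
    CHECK(cudaEventRecord(event, stream_src));

    // make the destination stream wait for the event, then issue the copy there,
    // so the copy never reads a half-written source buffer
    CHECK(cudaStreamWaitEvent(stream_dst, event, 0));
    CHECK(cudaMemcpyAsync(dst, src, n * sizeof(float), cudaMemcpyDeviceToDevice, stream_dst));

    CHECK(cudaStreamSynchronize(stream_dst));

    CHECK(cudaEventDestroy(event));
    CHECK(cudaStreamDestroy(stream_src));
    CHECK(cudaStreamDestroy(stream_dst));
    CHECK(cudaFree(src));
    CHECK(cudaFree(dst));
    return 0;
}

cudaEventDisableTiming is used because the event only orders the two streams and never needs a timestamp, which keeps event creation and recording cheap; the patch passes the same flag.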