From 865d042d56b846068cb15906047fa78491fa8b79 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 25 Dec 2023 19:07:32 +0100 Subject: [PATCH] use cudaMemcpy3DPeerAsync --- ggml-cuda.cu | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7de079079..240e2be8c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -68,9 +68,9 @@ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) #endif #define cudaMemcpy hipMemcpy -#define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyPeerAsync hipMemcpyPeerAsync +#define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice @@ -8258,17 +8258,23 @@ static void ggml_cuda_op_mul_mat( float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0 + row_low[id]; - +#if !defined(GGML_USE_HIPBLAS) if (kind == cudaMemcpyDeviceToDevice && id != g_main_device) { - // there is no cudaMemcpy2DPeerAsync so we need to copy each row separately - for (int64_t i = 0; i < src1_ncols; ++i) { - CUDA_CHECK(cudaMemcpyPeerAsync(dhf_dst_i + i*ne0, g_main_device, - dst_dd_i + i*row_diff, id, - row_diff*sizeof(float), stream)); - } - } else { - CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float), - row_diff*sizeof(float), src1_ncols, kind, stream)); + // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices + cudaMemcpy3DPeerParms p = {}; + p.dstDevice = g_main_device; + p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), ne0, src1_ncols); + p.srcDevice = id; + p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); + p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); + CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); + } else +#endif + { + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), + dst_dd_i, row_diff*sizeof(float), + row_diff*sizeof(float), src1_ncols, + kind, stream)); } } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);