use cudaMemcpy3DPeerAsync

This commit is contained in:
slaren 2023-12-25 19:07:32 +01:00
parent 1659cd1bc4
commit 865d042d56

View file

@ -68,9 +68,9 @@
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#endif #endif
#define cudaMemcpy hipMemcpy #define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
@ -8258,17 +8258,23 @@ static void ggml_cuda_op_mul_mat(
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0 + row_low[id]; dhf_dst_i += src1_col_0*ne0 + row_low[id];
#if !defined(GGML_USE_HIPBLAS)
if (kind == cudaMemcpyDeviceToDevice && id != g_main_device) { if (kind == cudaMemcpyDeviceToDevice && id != g_main_device) {
// there is no cudaMemcpy2DPeerAsync so we need to copy each row separately // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
for (int64_t i = 0; i < src1_ncols; ++i) { cudaMemcpy3DPeerParms p = {};
CUDA_CHECK(cudaMemcpyPeerAsync(dhf_dst_i + i*ne0, g_main_device, p.dstDevice = g_main_device;
dst_dd_i + i*row_diff, id, p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), ne0, src1_ncols);
row_diff*sizeof(float), stream)); p.srcDevice = id;
} p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
} else { p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float), CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
row_diff*sizeof(float), src1_ncols, kind, stream)); } else
#endif
{
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
dst_dd_i, row_diff*sizeof(float),
row_diff*sizeof(float), src1_ncols,
kind, stream));
} }
} else { } else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);