use cudaMemcpy3DPeerAsync
This commit is contained in:
parent
1659cd1bc4
commit
865d042d56
1 changed files with 17 additions and 11 deletions
28
ggml-cuda.cu
28
ggml-cuda.cu
|
@ -68,9 +68,9 @@
|
||||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||||
#endif
|
#endif
|
||||||
#define cudaMemcpy hipMemcpy
|
#define cudaMemcpy hipMemcpy
|
||||||
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
|
||||||
#define cudaMemcpyAsync hipMemcpyAsync
|
#define cudaMemcpyAsync hipMemcpyAsync
|
||||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||||
|
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||||
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||||
|
@ -8258,17 +8258,23 @@ static void ggml_cuda_op_mul_mat(
|
||||||
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
||||||
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
||||||
dhf_dst_i += src1_col_0*ne0 + row_low[id];
|
dhf_dst_i += src1_col_0*ne0 + row_low[id];
|
||||||
|
#if !defined(GGML_USE_HIPBLAS)
|
||||||
if (kind == cudaMemcpyDeviceToDevice && id != g_main_device) {
|
if (kind == cudaMemcpyDeviceToDevice && id != g_main_device) {
|
||||||
// there is no cudaMemcpy2DPeerAsync so we need to copy each row separately
|
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
||||||
for (int64_t i = 0; i < src1_ncols; ++i) {
|
cudaMemcpy3DPeerParms p = {};
|
||||||
CUDA_CHECK(cudaMemcpyPeerAsync(dhf_dst_i + i*ne0, g_main_device,
|
p.dstDevice = g_main_device;
|
||||||
dst_dd_i + i*row_diff, id,
|
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), ne0, src1_ncols);
|
||||||
row_diff*sizeof(float), stream));
|
p.srcDevice = id;
|
||||||
}
|
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
|
||||||
} else {
|
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
|
||||||
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
|
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
|
||||||
row_diff*sizeof(float), src1_ncols, kind, stream));
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
|
||||||
|
dst_dd_i, row_diff*sizeof(float),
|
||||||
|
row_diff*sizeof(float), src1_ncols,
|
||||||
|
kind, stream));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue