cuda : fix vmm pool with multi GPU
This commit is contained in:
parent
753be377b6
commit
32dc09aab3
1 changed files with 7 additions and 6 deletions
13
ggml-cuda.cu
13
ggml-cuda.cu
|
@ -6647,7 +6647,6 @@ static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIPBLAS)
|
||||||
// pool with virtual memory
|
// pool with virtual memory
|
||||||
static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
|
|
||||||
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
|
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
|
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
|
||||||
|
@ -6687,6 +6686,9 @@ static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
|
||||||
// map at the end of the pool
|
// map at the end of the pool
|
||||||
CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));
|
CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));
|
||||||
|
|
||||||
|
// the memory allocation handle is no longer needed after mapping
|
||||||
|
CU_CHECK(cuMemRelease(handle));
|
||||||
|
|
||||||
// set access
|
// set access
|
||||||
CUmemAccessDesc access = {};
|
CUmemAccessDesc access = {};
|
||||||
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||||
|
@ -6695,7 +6697,6 @@ static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
|
||||||
CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));
|
CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));
|
||||||
|
|
||||||
// add to the pool
|
// add to the pool
|
||||||
g_cuda_pool_handles[id].push_back(handle);
|
|
||||||
g_cuda_pool_size[id] += reserve_size;
|
g_cuda_pool_size[id] += reserve_size;
|
||||||
|
|
||||||
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||||
|
@ -8193,13 +8194,13 @@ static void ggml_cuda_op_mul_mat(
|
||||||
if (id != g_main_device) {
|
if (id != g_main_device) {
|
||||||
if (convert_src1_to_q8_1) {
|
if (convert_src1_to_q8_1) {
|
||||||
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
|
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
|
||||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
|
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
|
||||||
cudaMemcpyDeviceToDevice, stream));
|
src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
|
||||||
} else {
|
} else {
|
||||||
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
||||||
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
||||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
|
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
|
||||||
cudaMemcpyDeviceToDevice, stream));
|
src1_ncols*ne10*sizeof(float), stream));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue