Add CUDA option to use the memory pool max release threshold
This commit is contained in:
parent
41f308f58e
commit
72a9f4ea8c
3 changed files with 26 additions and 3 deletions
|
@ -96,6 +96,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
|
|||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"llama: max. batch size for using peer access")
|
||||
option(LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD "llama: use max release threshold for memory pool" OFF)
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
|
@ -349,6 +350,9 @@ if (LLAMA_CUBLAS)
|
|||
endif()
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
if (LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD)
|
||||
add_compile_definitions(GGML_CUDA_MEMORY_POOL_USE_MAX_RELEASE_THRESHOLD)
|
||||
endif()
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
if (WIN32)
|
||||
|
|
3
Makefile
3
Makefile
|
@ -413,6 +413,9 @@ else ifdef LLAMA_CUDA_DMMV_Y
|
|||
else
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
|
||||
endif # LLAMA_CUDA_MMV_Y
|
||||
ifdef LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_USE_MAX_RELEASE_THRESHOLD
|
||||
endif # LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
|
||||
ifdef LLAMA_CUDA_F16
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_F16
|
||||
endif # LLAMA_CUDA_F16
|
||||
|
|
22
ggml-cuda.cu
22
ggml-cuda.cu
|
@ -153,6 +153,11 @@
|
|||
// max batch size to use MMQ kernels when tensor cores are available
|
||||
#define MMQ_MAX_BATCH_SIZE 32
|
||||
|
||||
// The release threshold specifies the maximum amount of memory the CUDA memory pool caches
|
||||
// The default value is 0, which means all unused memory is released back to the OS on every synchronization operation
|
||||
// Define to use the maximum release threshold. Recommended when a single process uses the GPU device
|
||||
// #define GGML_CUDA_USE_MAX_RELEASE_THRESHOLD
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
|
@ -10628,7 +10633,9 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)
|
|||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
||||
cudaStream_t main_stream = g_cudaStreams[ctx->device][0];
|
||||
CUDA_CHECK(cudaFreeAsync(ctx->dev_ptr, main_stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(main_stream));
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
|
@ -10744,10 +10751,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
|
|||
|
||||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||
|
||||
cudaStream_t main_stream = g_cudaStreams[buft_ctx->device][0];
|
||||
void * dev_ptr;
|
||||
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
||||
cudaError_t err = cudaMallocAsync(&dev_ptr, size, main_stream);
|
||||
cudaStreamSynchronize(main_stream);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
|
||||
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMallocAsync failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
@ -11415,6 +11424,13 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
|||
/* .context = */ ctx
|
||||
};
|
||||
|
||||
#if defined(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
|
||||
uint64_t release_threshold = UINT64_MAX;
|
||||
cudaMemPool_t default_mem_pool;
|
||||
CUDA_CHECK(cudaDeviceGetDefaultMemPool(&default_mem_pool, device));
|
||||
CUDA_CHECK(cudaMemPoolSetAttribute(default_mem_pool, cudaMemPoolAttrReleaseThreshold, &release_threshold));
|
||||
#endif
|
||||
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue