commit ca7a2f81b3
Author: Yavor Ivanov, 2024-03-17 03:14:42 -07:00 (committed by GitHub)
3 changed files with 26 additions and 3 deletions

CMakeLists.txt

@@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
+option(LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD "llama: use max release threshold for memory pool" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
@@ -386,6 +387,9 @@ if (LLAMA_CUBLAS)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+if (LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD)
+add_compile_definitions(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
+endif()
if (LLAMA_STATIC)
if (WIN32)
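
With the new option wired into CMake, a CUDA build that opts into the maximum release threshold could be configured roughly as follows (a sketch only; the generator and any other flags depend on the local setup):

    cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD=ON
    cmake --build build --config Release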

Makefile

@@ -435,6 +435,9 @@ else ifdef LLAMA_CUDA_DMMV_Y
else
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # LLAMA_CUDA_MMV_Y
+ifdef LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
+MK_NVCCFLAGS += -DGGML_CUDA_USE_MAX_RELEASE_THRESHOLD
+endif # LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
ifdef LLAMA_CUDA_F16
MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_F16
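
For the Makefile build the switch is picked up like the other LLAMA_CUDA_* flags, so an invocation along these lines should enable it (assuming a cuBLAS-enabled build):

    make LLAMA_CUBLAS=1 LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD=1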

ggml-cuda.cu

@@ -166,6 +166,11 @@
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
+// The release threshold specifies the maximum amount of memory the CUDA memory pool caches.
+// The default value is 0, which means all unused memory is released back to the OS on every synchronization operation.
+// Define to use the maximum release threshold. Recommended when a single process uses the GPU device.
+// #define GGML_CUDA_USE_MAX_RELEASE_THRESHOLD
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
@@ -10704,7 +10709,9 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-CUDA_CHECK(cudaFree(ctx->dev_ptr));
+cudaStream_t main_stream = g_cudaStreams[ctx->device][0];
+CUDA_CHECK(cudaFreeAsync(ctx->dev_ptr, main_stream));
+CUDA_CHECK(cudaStreamSynchronize(main_stream));
delete ctx;
}
@@ -10818,10 +10825,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+cudaStream_t main_stream = g_cudaStreams[buft_ctx->device][0];
void * dev_ptr;
-cudaError_t err = cudaMalloc(&dev_ptr, size);
+cudaError_t err = cudaMallocAsync(&dev_ptr, size, main_stream);
+cudaStreamSynchronize(main_stream);
if (err != cudaSuccess) {
-fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMallocAsync failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr;
}
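
The cudaMalloc/cudaFree to cudaMallocAsync/cudaFreeAsync switch above follows the standard stream-ordered allocation pattern. A minimal standalone sketch of that pattern (not llama.cpp code; CUDA 11.2+ assumed, error handling reduced to one macro):

    #include <cstdio>
    #include <cuda_runtime.h>

    #define CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); return 1; } } while (0)

    int main() {
        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));

        // Allocation is ordered on the stream and served from the device's default memory pool.
        void * dev_ptr = nullptr;
        CHECK(cudaMallocAsync(&dev_ptr, 1 << 20, stream));

        // ... kernels using dev_ptr would be enqueued on the same stream here ...

        // The free is also stream-ordered; synchronizing afterwards restores the blocking
        // behaviour of plain cudaFree, which is what the buffer-free path above relies on.
        CHECK(cudaFreeAsync(dev_ptr, stream));
        CHECK(cudaStreamSynchronize(stream));

        CHECK(cudaStreamDestroy(stream));
        return 0;
    }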
@@ -11606,6 +11615,13 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
/* .context = */ ctx
};
+#if defined(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
+uint64_t release_threshold = UINT64_MAX;
+cudaMemPool_t default_mem_pool;
+CUDA_CHECK(cudaDeviceGetDefaultMemPool(&default_mem_pool, device));
+CUDA_CHECK(cudaMemPoolSetAttribute(default_mem_pool, cudaMemPoolAttrReleaseThreshold, &release_threshold));
+#endif
return cuda_backend;
}
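
For reference, the effect of defining GGML_CUDA_USE_MAX_RELEASE_THRESHOLD can be reproduced in isolation. A minimal sketch (assuming device 0, CUDA 11.2+, error checking omitted for brevity):

    #include <cstdint>
    #include <cuda_runtime.h>

    int main() {
        const int device = 0;
        cudaSetDevice(device);

        // Fetch the default stream-ordered memory pool of the device.
        cudaMemPool_t pool;
        cudaDeviceGetDefaultMemPool(&pool, device);

        // With the threshold at UINT64_MAX the pool keeps all freed blocks cached for reuse
        // instead of returning them to the OS at every synchronization point.
        uint64_t threshold = UINT64_MAX;
        cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

        return 0;
    }

As the comment added in ggml-cuda.cu notes, this is most useful when a single process owns the GPU, since memory cached by the pool is not returned to the system for other processes to use.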