diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c49d645c..ed92a6c68 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,6 +83,7 @@ option(LLAMA_CUBLAS                          "llama: use CUDA"
 #option(LLAMA_CUDA_CUBLAS                    "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS" OFF)
+option(LLAMA_CUDA_USE_CUDA_POOL              "llama: use CUDA memory pool instead of custom pool" OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations" OFF)
@@ -270,6 +271,11 @@ if (LLAMA_CUBLAS)
         if (LLAMA_CUDA_FORCE_MMQ)
             add_compile_definitions(GGML_CUDA_FORCE_MMQ)
         endif()
+
+        if (LLAMA_CUDA_USE_CUDA_POOL)
+            add_compile_definitions(GGML_USE_CUDA_MEMORY_POOL)
+        endif()
+
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -373,6 +379,10 @@ if (LLAMA_HIPBLAS)
         if (LLAMA_CUDA_FORCE_MMQ)
             target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
         endif()
+        if (LLAMA_CUDA_USE_CUDA_POOL)
+            target_compile_definitions(ggml-rocm PRIVATE GGML_USE_CUDA_MEMORY_POOL)
+        endif()
+
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index bed020394..83da27c7f 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -108,7 +108,7 @@
 #define CUDA_USE_TENSOR_CORES
 #endif
 
-#if !defined(GGML_CUDA_FORCE_CUSTOM_MEMORY_POOL)
+#if defined(GGML_USE_CUDA_MEMORY_POOL)
 #define CUDA_USE_MEMORY_POOL
 #endif
 
@@ -503,7 +503,6 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
-static bool g_cudaMutliGpuMemPoolSupported = true;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
@@ -5814,7 +5813,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }
 
 static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr || !g_cudaMutliGpuMemPoolSupported) {
+    if (g_cudaMemPools[id] == nullptr) {
         return ggml_cuda_pool_free(ptr, actual_size);
     }
     CUDA_CHECK(cudaFreeAsync(ptr, stream));
@@ -5938,7 +5937,9 @@ void ggml_init_cublas() {
             } else {
                 fprintf(stderr, "WARNING: Your main GPU device doesnt support CUDA memory pools. Using custom memory pool implementation.\n");
-                g_cudaMutliGpuMemPoolSupported = false;
+                for (int id = 0; id < g_device_count; ++id) {
+                    g_cudaMemPools[id] = nullptr;
+                }
             }
         }
 #endif
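For reference, defining `GGML_USE_CUDA_MEMORY_POOL` (via `LLAMA_CUDA_USE_CUDA_POOL`) opts into the CUDA stream-ordered allocator path (`cudaMallocAsync`/`cudaFreeAsync`); when the option is off, or when the runtime check above fails and every `g_cudaMemPools[id]` is cleared to `nullptr`, `ggml_cuda_pool_free_async` falls back to the custom pool. The sketch below is not taken from ggml-cuda.cu; it only illustrates the CUDA runtime calls that path relies on, with the device index, buffer size, and error-checking macro chosen for the example.

```cpp
// Standalone sketch of the stream-ordered allocator used when
// CUDA_USE_MEMORY_POOL is defined. Device 0 and the 1 MiB size are
// illustrative; CHECK() is a local helper, not ggml's CUDA_CHECK.
#include <cstdio>
#include <cuda_runtime.h>

#define CHECK(call)                                                        \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                    \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            return 1;                                                      \
        }                                                                  \
    } while (0)

int main() {
    const int device = 0;
    CHECK(cudaSetDevice(device));

    // Capability probe equivalent to the check that prints the warning above.
    int pools_supported = 0;
    CHECK(cudaDeviceGetAttribute(&pools_supported, cudaDevAttrMemoryPoolsSupported, device));
    if (!pools_supported) {
        fprintf(stderr, "device %d lacks memory pool support; fall back to a custom pool\n", device);
        return 0;
    }

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Allocation and free are ordered on the stream, so no explicit
    // synchronization is needed between the two calls.
    void * buf = nullptr;
    CHECK(cudaMallocAsync(&buf, 1 << 20, stream));
    CHECK(cudaFreeAsync(buf, stream));
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaStreamDestroy(stream));
    printf("stream-ordered alloc/free completed on device %d\n", device);
    return 0;
}
```

On a device or driver without stream-ordered allocator support, the probe takes the fallback branch; that is the same situation in which the diff now clears every `g_cudaMemPools[id]`, so the async free path routes back to `ggml_cuda_pool_free` instead of tracking support in a separate flag.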