CUDA pool is optional now.
parent 863166b4c3
commit 2b0303add6

2 changed files with 15 additions and 4 deletions
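This makes the stream-ordered CUDA memory pool opt-in rather than the default: builds now use the custom pool unless the new LLAMA_CUDA_USE_CUDA_POOL CMake option is enabled, e.g. cmake -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_USE_CUDA_POOL=ON.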
CMakeLists.txt
@@ -83,6 +83,7 @@ option(LLAMA_CUBLAS "llama: use CUDA"
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
+option(LLAMA_CUDA_USE_CUDA_POOL "llama: use CUDA memory instead of custom pool" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -270,6 +271,11 @@ if (LLAMA_CUBLAS)
     if (LLAMA_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()

+    if (LLAMA_CUDA_USE_CUDA_POOL)
+        add_compile_definitions(GGML_USE_CUDA_MEMORY_POOL)
+    endif()
+
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -373,6 +379,10 @@ if (LLAMA_HIPBLAS)
     if (LLAMA_CUDA_FORCE_MMQ)
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
     endif()
+    if (LLAMA_CUDA_USE_CUDA_POOL)
+        target_compile_definitions(ggml-rocm PRIVATE GGML_USE_CUDA_MEMORY_POOL)
+    endif()
+
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
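The same definition is wired into the HIP/ROCm path above, so hipBLAS builds honor the option for the ggml-rocm target as well.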
ggml-cuda.cu
@@ -108,7 +108,7 @@
 #define CUDA_USE_TENSOR_CORES
 #endif

-#if !defined(GGML_CUDA_FORCE_CUSTOM_MEMORY_POOL)
+#if defined(GGML_USE_CUDA_MEMORY_POOL)
 #define CUDA_USE_MEMORY_POOL
 #endif

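The compile-time default is inverted here: CUDA_USE_MEMORY_POOL used to be defined unless a build opted out with GGML_CUDA_FORCE_CUSTOM_MEMORY_POOL; it is now defined only when the build system opts in via GGML_USE_CUDA_MEMORY_POOL.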
@@ -503,7 +503,6 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
-static bool g_cudaMutliGpuMemPoolSupported = true;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

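With the pool now opt-in, the global g_cudaMutliGpuMemPoolSupported flag becomes redundant: whether a device can use CUDA memory pools is encoded directly in g_cudaMemPools, where a null entry means "fall back to the custom pool", as the remaining two hunks show.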
@@ -5814,7 +5813,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {


 static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr || !g_cudaMutliGpuMemPoolSupported) {
+    if (g_cudaMemPools[id] == nullptr) {
         return ggml_cuda_pool_free(ptr, actual_size);
     }
     CUDA_CHECK(cudaFreeAsync(ptr, stream));
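For context, here is a minimal standalone sketch of the pattern ggml_cuda_pool_free_async gates on: stream-ordered allocation and free through CUDA's memory-pool API when the device supports it, with plain cudaMalloc/cudaFree as the fallback. This is illustrative only, not llama.cpp code; the CUDA_CHECK macro, buffer size, and device index 0 are local assumptions of the sketch.

// Illustrative only: stream-ordered alloc/free via CUDA memory pools,
// falling back to synchronous calls when pools are unsupported.
#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            return 1;                                                      \
        }                                                                  \
    } while (0)

int main() {
    int supported = 0;
    CUDA_CHECK(cudaDeviceGetAttribute(&supported, cudaDevAttrMemoryPoolsSupported, 0));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    void * buf = nullptr;
    if (supported) {
        // Stream-ordered: both calls are ordered against `stream`,
        // so no device-wide synchronization is needed in between.
        CUDA_CHECK(cudaMallocAsync(&buf, 1 << 20, stream));
        CUDA_CHECK(cudaFreeAsync(buf, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    } else {
        // Fallback path, analogous to routing through the custom pool.
        CUDA_CHECK(cudaMalloc(&buf, 1 << 20));
        CUDA_CHECK(cudaFree(buf));
    }
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}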
@@ -5938,7 +5937,9 @@ void ggml_init_cublas() {
     } else {
         fprintf(stderr,
                 "WARNING: Your main GPU device doesnt support CUDA memory pools. Using custom memory pool implementation.\n");
-        g_cudaMutliGpuMemPoolSupported = false;
+        for (int id = 0; id < g_device_count; ++id) {
+            g_cudaMemPools[id] = nullptr;
+        }
     }
 }
 #endif
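The initialization policy this hunk establishes, sketched standalone below under stated assumptions: g_pools and MAX_DEVICES are stand-ins for llama.cpp's g_cudaMemPools and GGML_CUDA_MAX_DEVICES, and the release-threshold tuning at the end is an optional extra of the sketch, not something this commit does.

// Illustrative only: fill a per-device pool table, leaving entries null
// on devices without memory-pool support so callers can fall back.
#include <cstdio>
#include <cuda_runtime.h>

#define MAX_DEVICES 16  // stand-in for GGML_CUDA_MAX_DEVICES

static cudaMemPool_t g_pools[MAX_DEVICES] = {nullptr};

int main() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
        return 1;
    }
    if (device_count > MAX_DEVICES) {
        device_count = MAX_DEVICES;
    }
    for (int id = 0; id < device_count; ++id) {
        int supported = 0;
        cudaDeviceGetAttribute(&supported, cudaDevAttrMemoryPoolsSupported, id);
        if (!supported) {
            // Same policy as the commit: a null entry means this device
            // routes through the custom (non-async) pool instead.
            g_pools[id] = nullptr;
            fprintf(stderr, "device %d: no memory pool support, using fallback\n", id);
            continue;
        }
        cudaDeviceGetDefaultMemPool(&g_pools[id], id);
        // Optional tuning: keep up to 64 MiB cached in the pool between
        // allocations instead of returning it to the OS immediately.
        unsigned long long threshold = 64ull * 1024 * 1024;
        cudaMemPoolSetAttribute(g_pools[id], cudaMemPoolAttrReleaseThreshold, &threshold);
    }
    return 0;
}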