diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a1fec0fd7..6dbbc7950 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -383,7 +383,7 @@ struct cuda_buffer { static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; -static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { +void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -402,7 +402,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { return ptr; } -static void ggml_cuda_pool_free(void * ptr, size_t size) { +void ggml_cuda_pool_free(void * ptr, size_t size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { diff --git a/ggml-cuda.h b/ggml-cuda.h index ab2c690b0..b46a804c7 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -14,6 +14,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens // TODO: export these with GGML_API void * ggml_cuda_host_malloc(size_t size); void ggml_cuda_host_free(void * ptr); +void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size); +void ggml_cuda_pool_free(void * ptr, size_t size); void ggml_cuda_transform_tensor(struct ggml_tensor * tensor); void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets); diff --git a/llama.cpp b/llama.cpp index 91653f374..38de0e39d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10,6 +10,7 @@ #include "ggml.h" #ifdef GGML_USE_CUBLAS +#include #include "ggml-cuda.h" #endif