cuBLAS memory management routines
This commit is contained in:
parent
7fcfba2e28
commit
6e292d70a5
1 changed files with 36 additions and 13 deletions
49
ggml-cuda.cu
49
ggml-cuda.cu
|
@ -6,7 +6,7 @@
|
||||||
|
|
||||||
typedef uint16_t ggml_fp16_t;
|
typedef uint16_t ggml_fp16_t;
|
||||||
static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||||
// #define CUDA_MEM_DEBUG 1
|
#define CUDA_MEM_DEBUG 0
|
||||||
|
|
||||||
#define QK4_0 32
|
#define QK4_0 32
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -181,6 +181,19 @@ static int g_cuda_free_buffer_indices[MAX_CUDA_BUFFERS]; // sorted list of free
|
||||||
static int g_cuda_free_buffer_count = 0;
|
static int g_cuda_free_buffer_count = 0;
|
||||||
static size_t g_cuda_pool_total_allocated = 0;
|
static size_t g_cuda_pool_total_allocated = 0;
|
||||||
|
|
||||||
|
void cuda_pool_dump() {
|
||||||
|
printf("========================================\n");
|
||||||
|
printf("| Current CUDA Buffer Pool |\n");
|
||||||
|
printf("========================================\n");
|
||||||
|
printf("| %-6s | %-12s | %-10s |\n", "Index", "Buffer Index", "Size (bytes)");
|
||||||
|
printf("----------------------------------------\n");
|
||||||
|
for (int i = 0; i < g_cuda_free_buffer_count; ++i) {
|
||||||
|
int buffer_index = g_cuda_free_buffer_indices[i];
|
||||||
|
printf("| %-6d | %-12d | %-10zu |\n",
|
||||||
|
i, buffer_index, g_cuda_buffer_pool[buffer_index].size);
|
||||||
|
}
|
||||||
|
printf("========================================\n");
|
||||||
|
}
|
||||||
|
|
||||||
void cuda_pool_insert_idx(int index) {
|
void cuda_pool_insert_idx(int index) {
|
||||||
int left = 0;
|
int left = 0;
|
||||||
|
@ -188,7 +201,7 @@ void cuda_pool_insert_idx(int index) {
|
||||||
|
|
||||||
while (left <= right) {
|
while (left <= right) {
|
||||||
int mid = (left + right) / 2;
|
int mid = (left + right) / 2;
|
||||||
if (g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid]].size <= g_cuda_buffer_pool[index].size) {
|
if (g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid]].size < g_cuda_buffer_pool[index].size) {
|
||||||
left = mid + 1;
|
left = mid + 1;
|
||||||
} else {
|
} else {
|
||||||
right = mid - 1;
|
right = mid - 1;
|
||||||
|
@ -202,8 +215,9 @@ void cuda_pool_insert_idx(int index) {
|
||||||
|
|
||||||
g_cuda_free_buffer_indices[left] = index;
|
g_cuda_free_buffer_indices[left] = index;
|
||||||
g_cuda_free_buffer_count++;
|
g_cuda_free_buffer_count++;
|
||||||
#ifdef CUDA_MEM_DEBUG
|
#if CUDA_MEM_DEBUG
|
||||||
printf("INFO:: Inserted buffer at index %d with size %d bytes\n", index, g_cuda_buffer_pool[index].size);
|
printf("INFO:: Inserted buffer at index %d with size %lld bytes\n", index, g_cuda_buffer_pool[index].size);
|
||||||
|
cuda_pool_dump();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
// find the best fitting block in the pool and remove it from the indexed list
|
// find the best fitting block in the pool and remove it from the indexed list
|
||||||
|
@ -216,7 +230,11 @@ int cuda_pool_get_block(size_t size, size_t * actual_size) {
|
||||||
int mid = (left + right) / 2;
|
int mid = (left + right) / 2;
|
||||||
if (g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid]].size >= size) {
|
if (g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid]].size >= size) {
|
||||||
index = mid;
|
index = mid;
|
||||||
right = mid - 1;
|
if (mid > 0 && g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid - 1]].size >= size) {
|
||||||
|
right = mid - 1; // continue searching to the left for smaller fitting buffers
|
||||||
|
} else {
|
||||||
|
break; // found the smallest fitting buffer
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
left = mid + 1;
|
left = mid + 1;
|
||||||
}
|
}
|
||||||
|
@ -231,17 +249,20 @@ int cuda_pool_get_block(size_t size, size_t * actual_size) {
|
||||||
g_cuda_free_buffer_indices[i] = g_cuda_free_buffer_indices[i + 1];
|
g_cuda_free_buffer_indices[i] = g_cuda_free_buffer_indices[i + 1];
|
||||||
}
|
}
|
||||||
g_cuda_free_buffer_count--;
|
g_cuda_free_buffer_count--;
|
||||||
#ifdef CUDA_MEM_DEBUG
|
#if CUDA_MEM_DEBUG
|
||||||
printf("INFO:: Found buffer of size %d bytes at index %d\n", *actual_size, buffer_index);
|
printf("INFO:: Found buffer of size %lld bytes at index %d\n", *actual_size, buffer_index);
|
||||||
|
cuda_pool_dump();
|
||||||
#endif
|
#endif
|
||||||
return buffer_index;
|
return buffer_index;
|
||||||
}
|
}
|
||||||
#ifdef CUDA_MEM_DEBUG
|
#if CUDA_MEM_DEBUG
|
||||||
printf("INFO:: No buffer found for size %d bytes\n", size);
|
printf("INFO:: No buffer found for size %lld bytes\n", size);
|
||||||
|
cuda_pool_dump();
|
||||||
#endif
|
#endif
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// mark all buffers as free
|
// mark all buffers as free
|
||||||
void ggml_cuda_pool_initialize() {
|
void ggml_cuda_pool_initialize() {
|
||||||
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
||||||
|
@ -262,8 +283,9 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||||
g_cuda_buffer_pool[buffer_index].ptr = nullptr;
|
g_cuda_buffer_pool[buffer_index].ptr = nullptr;
|
||||||
g_cuda_buffer_pool[buffer_index].size = 0;
|
g_cuda_buffer_pool[buffer_index].size = 0;
|
||||||
g_cuda_pool_total_allocated -= *actual_size;
|
g_cuda_pool_total_allocated -= *actual_size;
|
||||||
#ifdef CUDA_MEM_DEBUG
|
#if CUDA_MEM_DEBUG
|
||||||
printf("INFO:: Allocated %d bytes from buffer at index %d (total allocated: %d bytes)\n", *actual_size, buffer_index, g_cuda_pool_total_allocated);
|
printf("INFO:: Allocated %lld bytes from buffer at index %d (total allocated: %lld bytes)\n", *actual_size, buffer_index, g_cuda_pool_total_allocated);
|
||||||
|
cuda_pool_dump();
|
||||||
#endif
|
#endif
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
@ -274,8 +296,9 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||||
CUDA_CHECK(cudaMalloc((void **) &ptr, size));
|
CUDA_CHECK(cudaMalloc((void **) &ptr, size));
|
||||||
*actual_size = size;
|
*actual_size = size;
|
||||||
g_cuda_pool_total_allocated += size;
|
g_cuda_pool_total_allocated += size;
|
||||||
#ifdef CUDA_MEM_DEBUG
|
#if CUDA_MEM_DEBUG
|
||||||
printf("INFO:: Allocated %d bytes from cudaMalloc (total allocated: %d bytes)\n", *actual_size, g_cuda_pool_total_allocated);
|
printf("INFO:: Allocated %lld bytes from cudaMalloc (total allocated: %lld bytes)\n", *actual_size, g_cuda_pool_total_allocated);
|
||||||
|
cuda_pool_dump();
|
||||||
#endif
|
#endif
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue