diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9b32dc0e0..92ae3fb0c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6,7 +6,7 @@ typedef uint16_t ggml_fp16_t; static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size"); -// #define CUDA_MEM_DEBUG 1 + #define CUDA_MEM_DEBUG 0 #define QK4_0 32 typedef struct { @@ -181,6 +181,19 @@ static int g_cuda_free_buffer_indices[MAX_CUDA_BUFFERS]; // sorted list of free static int g_cuda_free_buffer_count = 0; static size_t g_cuda_pool_total_allocated = 0; +void cuda_pool_dump() { + printf("========================================\n"); + printf("| Current CUDA Buffer Pool |\n"); + printf("========================================\n"); + printf("| %-6s | %-12s | %-10s |\n", "Index", "Buffer Index", "Size (bytes)"); + printf("----------------------------------------\n"); + for (int i = 0; i < g_cuda_free_buffer_count; ++i) { + int buffer_index = g_cuda_free_buffer_indices[i]; + printf("| %-6d | %-12d | %-10zu |\n", + i, buffer_index, g_cuda_buffer_pool[buffer_index].size); + } + printf("========================================\n"); +} void cuda_pool_insert_idx(int index) { int left = 0; @@ -188,7 +201,7 @@ void cuda_pool_insert_idx(int index) { while (left <= right) { int mid = (left + right) / 2; - if (g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid]].size <= g_cuda_buffer_pool[index].size) { + if (g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid]].size < g_cuda_buffer_pool[index].size) { left = mid + 1; } else { right = mid - 1; @@ -202,8 +215,9 @@ void cuda_pool_insert_idx(int index) { g_cuda_free_buffer_indices[left] = index; g_cuda_free_buffer_count++; - #ifdef CUDA_MEM_DEBUG - printf("INFO:: Inserted buffer at index %d with size %d bytes\n", index, g_cuda_buffer_pool[index].size); + #if CUDA_MEM_DEBUG + printf("INFO:: Inserted buffer at index %d with size %lld bytes\n", index, g_cuda_buffer_pool[index].size); + cuda_pool_dump(); #endif } // find the best fitting block in the pool and remove it from the indexed list @@ -216,7 +230,11 @@ int cuda_pool_get_block(size_t size, size_t * actual_size) { int mid = (left + right) / 2; if (g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid]].size >= size) { index = mid; - right = mid - 1; + if (mid > 0 && g_cuda_buffer_pool[g_cuda_free_buffer_indices[mid - 1]].size >= size) { + right = mid - 1; // continue searching to the left for smaller fitting buffers + } else { + break; // found the smallest fitting buffer + } } else { left = mid + 1; } @@ -231,17 +249,20 @@ int cuda_pool_get_block(size_t size, size_t * actual_size) { g_cuda_free_buffer_indices[i] = g_cuda_free_buffer_indices[i + 1]; } g_cuda_free_buffer_count--; - #ifdef CUDA_MEM_DEBUG - printf("INFO:: Found buffer of size %d bytes at index %d\n", *actual_size, buffer_index); + #if CUDA_MEM_DEBUG + printf("INFO:: Found buffer of size %lld bytes at index %d\n", *actual_size, buffer_index); + cuda_pool_dump(); #endif return buffer_index; } - #ifdef CUDA_MEM_DEBUG - printf("INFO:: No buffer found for size %d bytes\n", size); + #if CUDA_MEM_DEBUG + printf("INFO:: No buffer found for size %lld bytes\n", size); + cuda_pool_dump(); #endif return -1; } + // mark all buffers as free void ggml_cuda_pool_initialize() { for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -262,8 +283,9 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { g_cuda_buffer_pool[buffer_index].ptr = nullptr; g_cuda_buffer_pool[buffer_index].size = 0; g_cuda_pool_total_allocated -= *actual_size; - #ifdef CUDA_MEM_DEBUG - printf("INFO:: Allocated %d bytes from buffer at index %d (total allocated: %d bytes)\n", *actual_size, buffer_index, g_cuda_pool_total_allocated); + #if CUDA_MEM_DEBUG + printf("INFO:: Allocated %lld bytes from buffer at index %d (total allocated: %lld bytes)\n", *actual_size, buffer_index, g_cuda_pool_total_allocated); + cuda_pool_dump(); #endif return ptr; } @@ -274,8 +296,9 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { CUDA_CHECK(cudaMalloc((void **) &ptr, size)); *actual_size = size; g_cuda_pool_total_allocated += size; - #ifdef CUDA_MEM_DEBUG - printf("INFO:: Allocated %d bytes from cudaMalloc (total allocated: %d bytes)\n", *actual_size, g_cuda_pool_total_allocated); + #if CUDA_MEM_DEBUG + printf("INFO:: Allocated %lld bytes from cudaMalloc (total allocated: %lld bytes)\n", *actual_size, g_cuda_pool_total_allocated); + cuda_pool_dump(); #endif return ptr; }