switch to upstream implementation of pool malloc

This commit is contained in:
Concedo 2023-08-07 15:16:37 +08:00
parent 6659652c9f
commit 9f16a4c4ef

View file

@ -4293,45 +4293,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock); scoped_spin_lock lock(g_cuda_pool_lock);
int id; int id;
CUDA_CHECK(cudaGetDevice(&id)); CUDA_CHECK(cudaGetDevice(&id));
#ifdef DEBUG_CUDA_MALLOC
int best_i = -1; int nnz = 0;
size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs size_t max_size = 0, tot_size = 0;
int worst_i = -1; #endif
size_t worst_size = 0; //largest unused buffer seen so far size_t best_diff = 1ull << 36;
int ibest = -1;
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[id][i]; cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.size > 0 && b.size >= size && b.size < best_size) if (b.ptr != nullptr) {
{ #ifdef DEBUG_CUDA_MALLOC
best_i = i; ++nnz;
best_size = b.size; tot_size += b.size;
} if (b.size > max_size) max_size = b.size;
if (b.size > 0 && b.size > worst_size) #endif
{ if (b.size >= size) {
worst_i = i; size_t diff = b.size - size;
worst_size = b.size; if (diff < best_diff) {
best_diff = diff;
ibest = i;
if (!best_diff) {
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
}
}
}
} }
} }
if(best_i!=-1) //found the smallest buffer that fits our needs if (ibest >= 0) {
{ cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
cuda_buffer& b = g_cuda_buffer_pool[id][best_i];
void * ptr = b.ptr; void * ptr = b.ptr;
*actual_size = b.size; *actual_size = b.size;
b.ptr = nullptr; b.ptr = nullptr;
b.size = 0; b.size = 0;
return ptr; return ptr;
} }
if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory #ifdef DEBUG_CUDA_MALLOC
{ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
cuda_buffer& b = g_cuda_buffer_pool[id][worst_i]; (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
b.size = 0; #endif
void * ptr = b.ptr;
cudaFree(ptr);
b.ptr = ptr = nullptr;
}
void * ptr; void * ptr;
CUDA_CHECK(cudaMalloc((void **) &ptr, size)); size_t look_ahead_size = (size_t) (1.05 * size);
*actual_size = size; look_ahead_size = 256 * ((look_ahead_size + 255)/256);
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
*actual_size = look_ahead_size;
return ptr; return ptr;
} }