Add stub "usage" token counts to completion API responses; revert CUDA malloc pool implementation (+1 squashed commit)

Squashed commits:

[fd4cfb44] Add stub "usage" token counts to completion API responses; revert CUDA malloc pool implementation
This commit is contained in:
Concedo 2024-01-05 19:02:45 +08:00
parent c9fdd42da2
commit 427ba21e62
2 changed files with 31 additions and 31 deletions

View file

@ -6592,43 +6592,44 @@ static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) { static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock); scoped_spin_lock lock(g_cuda_pool_lock);
#ifdef DEBUG_CUDA_MALLOC
int nnz = 0; int best_i = -1;
size_t max_size = 0; size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
#endif int worst_i = -1;
size_t best_diff = 1ull << 36; size_t worst_size = 0; //largest unused buffer seen so far
int ibest = -1;
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i]; ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
if (b.ptr != nullptr) { if (b.size > 0 && b.size >= size && b.size < best_size)
#ifdef DEBUG_CUDA_MALLOC {
++nnz; best_i = i;
if (b.size > max_size) max_size = b.size; best_size = b.size;
#endif }
if (b.size >= size) { if (b.size > 0 && b.size > worst_size)
size_t diff = b.size - size; {
if (diff < best_diff) { worst_i = i;
best_diff = diff; worst_size = b.size;
ibest = i;
if (!best_diff) {
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
}
}
}
} }
} }
if (ibest >= 0) { if(best_i!=-1) //found the smallest buffer that fits our needs
ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest]; {
ggml_cuda_buffer& b = g_cuda_buffer_pool[device][best_i];
void * ptr = b.ptr; void * ptr = b.ptr;
*actual_size = b.size; *actual_size = b.size;
b.ptr = nullptr; b.ptr = nullptr;
b.size = 0; b.size = 0;
return ptr; return ptr;
} }
if(worst_i!=-1 && !g_mul_mat_q) //no buffer that fits our needs, resize largest one to save memory (non mmq only)
{
ggml_cuda_buffer& b = g_cuda_buffer_pool[device][worst_i];
b.size = 0;
void * ptr = b.ptr;
ggml_cuda_set_device(device);
cudaFree(ptr);
g_cuda_pool_size[device] -= size;
b.ptr = ptr = nullptr;
}
void * ptr; void * ptr;
size_t look_ahead_size = (size_t) (1.05 * size); size_t look_ahead_size = (size_t) (1.05 * size);
@ -6637,10 +6638,7 @@ static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size)); CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
*actual_size = look_ahead_size; *actual_size = look_ahead_size;
g_cuda_pool_size[device] += look_ahead_size; g_cuda_pool_size[device] += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
(uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
#endif
return ptr; return ptr;
} }

View file

@ -564,9 +564,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
res = {"data": {"seqs":[recvtxt]}} res = {"data": {"seqs":[recvtxt]}}
elif api_format==3: elif api_format==3:
res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": friendlymodelname, res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": friendlymodelname,
"usage": {"prompt_tokens": 100,"completion_tokens": 100,"total_tokens": 200},
"choices": [{"text": recvtxt, "index": 0, "finish_reason": "length"}]} "choices": [{"text": recvtxt, "index": 0, "finish_reason": "length"}]}
elif api_format==4: elif api_format==4:
res = {"id": "chatcmpl-1", "object": "chat.completion", "created": 1, "model": friendlymodelname, res = {"id": "chatcmpl-1", "object": "chat.completion", "created": 1, "model": friendlymodelname,
"usage": {"prompt_tokens": 100,"completion_tokens": 100,"total_tokens": 200},
"choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]} "choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]}
else: else:
res = {"results": [{"text": recvtxt}]} res = {"results": [{"text": recvtxt}]}