CLBlast fixes + enhancements to save VRAM:

1. Change all CLBlast buffers to CL_MEM_READ_WRITE, as the pool malloc currently doesn't handle them properly (buffers are recycled without regard to their access flags, so a recycled buffer may not have the access mode its new user needs).
2. When recycling buffers in pool malloc, always assign the SMALLEST available buffer that fits, instead of the FIRST available buffer.
3. When no buffer can be recycled in pool malloc (all free buffers are too small), release the largest free buffer and allocate a fresh one in its place, effectively resizing it. (A standalone sketch of the resulting policy follows this list.)
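
For readers who want the policy in isolation, here is a minimal, self-contained sketch along the lines of points 1-3. It is an illustration only, not the patched pool-malloc code from the diff below: POOL_MAX, pool_buf, pool_malloc and ctx are placeholder names, and a valid cl_context is assumed to come from elsewhere.

/* Minimal sketch of the recycling policy described above (illustration only).
 * Assumptions: a valid cl_context created elsewhere; POOL_MAX, pool_buf and
 * pool_malloc are placeholder names, not identifiers from the real code. */
#include <CL/cl.h>
#include <stddef.h>

#define POOL_MAX 16

typedef struct { cl_mem mem; size_t size; } pool_buf;
static pool_buf pool[POOL_MAX];          /* size == 0 means the slot is empty */

static cl_mem pool_malloc(cl_context ctx, size_t size, size_t *actual_size) {
    int best_i = -1, worst_i = -1;
    size_t best_size = (size_t)-1;       /* smallest cached buffer that fits  */
    size_t worst_size = 0;               /* largest cached buffer seen so far */

    for (int i = 0; i < POOL_MAX; ++i) {
        if (pool[i].size > 0 && pool[i].size >= size && pool[i].size < best_size) {
            best_i = i; best_size = pool[i].size;
        }
        if (pool[i].size > worst_size) {
            worst_i = i; worst_size = pool[i].size;
        }
    }

    if (best_i != -1) {                  /* point 2: reuse the smallest fit, not the first */
        cl_mem mem = pool[best_i].mem;
        *actual_size = pool[best_i].size;
        pool[best_i].size = 0;
        return mem;
    }
    if (worst_i != -1) {                 /* point 3: nothing fits, so release the largest  */
        clReleaseMemObject(pool[worst_i].mem);
        pool[worst_i].size = 0;
    }

    /* point 1: a single CL_MEM_READ_WRITE flag, so any cached buffer can later
     * be recycled as either a kernel input or a kernel output */
    cl_int err;
    cl_mem mem = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err);
    *actual_size = size;
    return err == CL_SUCCESS ? mem : NULL;
}

Best-fit stops a small request from claiming a huge cached buffer, and the release-largest fallback trims the pool by one entry before a fresh allocation is made, which is where the VRAM saving comes from.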
Author: Concedo
Date:   2023-06-02 22:10:49 +08:00
Parent: 24239f0df7
Commit: 59fe16877d

@@ -608,14 +608,35 @@ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flag
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;
 
+    int best_i = -1, best_size = (size_t)-1; //smallest unused buffer that fits our needs
+    int worst_i = -1, worst_size = 0; //largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
         cl_buffer &b = g_cl_buffer_pool[i];
-        if (b.size > 0 && b.size >= size) {
-            cl_mem mem = b.mem;
-            *actual_size = b.size;
-            b.size = 0;
-            return mem;
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
         }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
+        }
+    }
+    if(best_i!=-1) //found the smallest buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
     }
     cl_mem mem;
     CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
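
For context on how this pool is exercised by the callers in the following hunks, here is a hedged caller-side sketch of the round trip. The companion ggml_cl_pool_free(mem, size) helper and the extern declarations are assumptions for illustration only (in the real file these functions are static); none of this is part of the commit.

/* Caller-side usage sketch (illustration only, not part of this commit).
 * Assumption: ggml_cl_pool_free(mem, size) is the companion helper that hands
 * a buffer back to the pool; in the real source both functions are static. */
#include <CL/cl.h>
#include <stddef.h>

extern cl_mem ggml_cl_pool_malloc(size_t size, size_t *actual_size, cl_mem_flags flags);
extern void   ggml_cl_pool_free(cl_mem mem, size_t size);

static void run_one_op(size_t n_floats) {
    size_t x_size = 0, d_size = 0;

    /* Both requests use CL_MEM_READ_WRITE, so whichever cached buffer the
     * best-fit search returns is equally valid as an input or an output. */
    cl_mem d_X = ggml_cl_pool_malloc(n_floats * sizeof(float), &x_size, CL_MEM_READ_WRITE);
    cl_mem d_D = ggml_cl_pool_malloc(n_floats * sizeof(float), &d_size, CL_MEM_READ_WRITE);

    /* ... enqueue host->device copy, launch the kernel, read the result back ... */

    /* Return the buffers with their actual (possibly larger) sizes, so an
     * oversized recycled buffer keeps its full capacity for the next search. */
    ggml_cl_pool_free(d_X, x_size);
    ggml_cl_pool_free(d_D, d_size);
}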
@@ -692,9 +713,9 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t x_size;
     size_t d_size;
 
-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_WRITE); // src0
     cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_READ_WRITE); // dst
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -792,10 +813,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_WRITE);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_WRITE);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_READ_WRITE);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -868,10 +889,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_WRITE);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_WRITE);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_READ_WRITE);
 
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -972,11 +993,11 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     if (!mul_mat_vec) {
         d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_WRITE);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_READ_WRITE);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_WRITE);
     }
 
     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
@@ -1143,7 +1164,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_WRITE);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {