removed flags from the CL pool malloc, apply code tidying suggestions.

This commit is contained in:
Concedo 2023-06-05 13:31:37 +08:00
parent 64e3e74556
commit f6431ded5d

View file

@ -605,12 +605,14 @@ struct cl_buffer {
static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) { static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cl_pool_lock); scoped_spin_lock lock(g_cl_pool_lock);
cl_int err; cl_int err;
int best_i = -1, best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs int best_i = -1;
int worst_i = -1, worst_size = 0; //largest unused buffer seen so far size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
int worst_i = -1;
size_t worst_size = 0; //largest unused buffer seen so far
for (int i = 0; i < MAX_CL_BUFFERS; ++i) { for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
cl_buffer &b = g_cl_buffer_pool[i]; cl_buffer &b = g_cl_buffer_pool[i];
if (b.size > 0 && b.size >= size && b.size < best_size) if (b.size > 0 && b.size >= size && b.size < best_size)
@ -640,7 +642,7 @@ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flag
clReleaseMemObject(mem); clReleaseMemObject(mem);
} }
cl_mem mem; cl_mem mem;
CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err)); CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
*actual_size = size; *actual_size = size;
return mem; return mem;
} }
@ -714,9 +716,9 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t x_size; size_t x_size;
size_t d_size; size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_WRITE); // src0 cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted. cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_READ_WRITE); // dst cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -815,10 +817,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend == GGML_BACKEND_CL) { if (src0->backend == GGML_BACKEND_CL) {
d_X = (cl_mem) src0->data; d_X = (cl_mem) src0->data;
} else { } else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_WRITE); d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
} }
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_WRITE); cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_READ_WRITE); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i02 = 0; i02 < ne02; i02++) {
@ -891,10 +893,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend == GGML_BACKEND_CL) { if (src0->backend == GGML_BACKEND_CL) {
d_X = (cl_mem) src0->data; d_X = (cl_mem) src0->data;
} else { } else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_WRITE); d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
} }
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_WRITE); cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_READ_WRITE); cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
bool src1_cont_rows = nb10 == sizeof(float); bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@ -993,13 +995,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
size_t q_size; size_t q_size;
cl_mem d_X; cl_mem d_X;
if (!mul_mat_vec) { if (!mul_mat_vec) {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE); d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
} }
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_WRITE); cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_READ_WRITE); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
cl_mem d_Q; cl_mem d_Q;
if (src0->backend == GGML_BACKEND_CPU) { if (src0->backend == GGML_BACKEND_CPU) {
d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_WRITE); d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
} }
cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
@ -1166,7 +1168,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type); const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
size_t q_size; size_t q_size;
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_WRITE); cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
// copy tensor to device // copy tensor to device
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {