Merge branch 'opencl-dev-concedo' into concedo_experimental

commit 96b0e536b7

1 changed file with 39 additions and 18 deletions
@@ -616,14 +616,35 @@ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flag
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;
 
+    int best_i = -1, best_size = (size_t)-1; //smallest unused buffer that fits our needs
+    int worst_i = -1, worst_size = 0; //largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer& b = g_cl_buffer_pool[i];
-        if (b.size > 0 && b.size >= size) {
-            cl_mem mem = b.mem;
-            *actual_size = b.size;
-            b.size = 0;
-            return mem;
+        cl_buffer &b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
         }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
+        }
+    }
+    if(best_i!=-1) //found the smallest buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
     }
     cl_mem mem;
     CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
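The allocator above replaces the old first-fit scan with a best-fit search plus a largest-buffer eviction fallback. Below is a minimal standalone sketch of the same strategy, using plain malloc/free and an illustrative Buffer struct in place of the actual cl_buffer/cl_mem types (MAX_BUFFERS, Buffer, and pool_malloc are assumed names for illustration, not the ggml API):

// Minimal sketch of the pool strategy above; illustrative only.
#include <cstddef>
#include <cstdlib>

constexpr int MAX_BUFFERS = 256;

struct Buffer {
    void * mem  = nullptr;
    size_t size = 0;        // 0 means the slot is unused
};

static Buffer pool[MAX_BUFFERS];

void * pool_malloc(size_t size, size_t * actual_size) {
    int best_i  = -1; size_t best_size  = (size_t)-1; // smallest buffer that fits
    int worst_i = -1; size_t worst_size = 0;          // largest buffer seen so far
    for (int i = 0; i < MAX_BUFFERS; ++i) {
        Buffer & b = pool[i];
        if (b.size > 0 && b.size >= size && b.size < best_size) {
            best_i = i;  best_size = b.size;
        }
        if (b.size > 0 && b.size > worst_size) {
            worst_i = i; worst_size = b.size;
        }
    }
    if (best_i != -1) {        // reuse the tightest fit
        Buffer & b = pool[best_i];
        *actual_size = b.size;
        void * mem = b.mem;
        b.size = 0;
        return mem;
    }
    if (worst_i != -1) {       // nothing fits: evict the largest idle buffer
        Buffer & b = pool[worst_i];
        free(b.mem);
        b.size = 0;
    }
    *actual_size = size;
    return malloc(size);       // fresh allocation
}

First-fit could hand a very large pooled buffer to a small request; best-fit reuses the tightest match instead, and when nothing fits, releasing the largest idle buffer before allocating keeps the pool's footprint from growing without bound.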
@@ -700,9 +721,9 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t x_size;
     size_t d_size;
 
-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_WRITE); // src0
     cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_READ_WRITE); // dst
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
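This and the remaining hunks all move pooled buffers from CL_MEM_READ_ONLY / CL_MEM_WRITE_ONLY to CL_MEM_READ_WRITE: a buffer recycled through the pool may serve as a kernel's input in one call and its output in the next, so it needs both access directions. A hedged sketch of the allocation itself (standard OpenCL API; `context` and `n` are assumed to be in scope):

// Sketch: a pooled buffer created with both access directions.
size_t size = n * sizeof(float);   // n: element count (assumption)
cl_int err;
cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
// Per the OpenCL spec, a kernel writing to a CL_MEM_READ_ONLY buffer is
// undefined behavior, so a READ_ONLY allocation could never be recycled
// by the pool as a dst buffer. READ_WRITE makes every slot interchangeable.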
@@ -800,10 +821,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_WRITE);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_WRITE);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_READ_WRITE);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -877,10 +898,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_WRITE);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_WRITE);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_READ_WRITE);
 
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -982,11 +1003,11 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     if (!mul_mat_vec) {
         d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_WRITE);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_READ_WRITE);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_WRITE);
     }
 
     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
@@ -1154,7 +1175,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_WRITE);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
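The loop that follows this hunk uploads the tensor to the freshly pooled device buffer. A hedged sketch of the host-to-device copy pattern (standard OpenCL API; `queue`, `dst`, `src`, and `nbytes` are assumed in scope, and the per-(i2, i3) slice offsets of the real loop are omitted):

// Sketch: blocking host-to-device copy into the pooled buffer.
cl_int err = clEnqueueWriteBuffer(
    queue, dst,
    CL_TRUE,       // blocking write: `src` is safe to reuse on return
    0,             // destination offset in bytes
    nbytes, src,
    0, NULL, NULL  // no event dependencies, no out event
);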