ggml : multi-threaded get_rows
This commit is contained in:
parent
bc98eda9d5
commit
a97198747f
1 changed file with 61 additions and 40 deletions
101
ggml.c
101
ggml.c
|
@ -10744,8 +10744,6 @@ static void ggml_compute_forward_get_rows_q(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
assert(params->ith == 0);
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -10753,7 +10751,7 @@ static void ggml_compute_forward_get_rows_q(
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
const int64_t nc = ne00;
|
const int64_t nc = ne00;
|
||||||
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
const int64_t nr = ggml_nelements(src1);
|
||||||
|
|
||||||
const enum ggml_type type = src0->type;
|
const enum ggml_type type = src0->type;
|
||||||
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
|
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
|
||||||
|
@ -10763,17 +10761,25 @@ static void ggml_compute_forward_get_rows_q(
|
||||||
assert(nb00 == ggml_type_size(type));
|
assert(nb00 == ggml_type_size(type));
|
||||||
assert(ggml_nrows(dst) == nr);
|
assert(ggml_nrows(dst) == nr);
|
||||||
|
|
||||||
// TODO: multi-thread
|
const int ith = params->ith;
|
||||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
const int nth = params->nth;
|
||||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
|
||||||
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
|
||||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
||||||
|
|
||||||
dequantize_row_q(
|
// rows per thread
|
||||||
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
const int dr = (nr + nth - 1)/nth;
|
||||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
|
||||||
}
|
// row range for this thread
|
||||||
}
|
const int ir0 = dr*ith;
|
||||||
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
|
|
||||||
|
for (int64_t i = ir0; i < ir1; ++i) {
|
||||||
|
const int64_t i12 = i/(ne11*ne10);
|
||||||
|
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
|
||||||
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
||||||
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||||
|
|
||||||
|
dequantize_row_q(
|
||||||
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||||
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10782,8 +10788,6 @@ static void ggml_compute_forward_get_rows_f16(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
assert(params->ith == 0);
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -10791,24 +10795,32 @@ static void ggml_compute_forward_get_rows_f16(
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
const int64_t nc = ne00;
|
const int64_t nc = ne00;
|
||||||
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
const int64_t nr = ggml_nelements(src1);
|
||||||
|
|
||||||
assert(ne0 == nc);
|
assert(ne0 == nc);
|
||||||
assert(ne02 == ne11);
|
assert(ne02 == ne11);
|
||||||
assert(nb00 == sizeof(ggml_fp16_t));
|
assert(nb00 == sizeof(ggml_fp16_t));
|
||||||
assert(ggml_nrows(dst) == nr);
|
assert(ggml_nrows(dst) == nr);
|
||||||
|
|
||||||
// TODO: multi-thread
|
const int ith = params->ith;
|
||||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
const int nth = params->nth;
|
||||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
|
||||||
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
|
||||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
||||||
|
|
||||||
ggml_fp16_to_fp32_row(
|
// rows per thread
|
||||||
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
const int dr = (nr + nth - 1)/nth;
|
||||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
|
||||||
}
|
// row range for this thread
|
||||||
}
|
const int ir0 = dr*ith;
|
||||||
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
|
|
||||||
|
for (int64_t i = ir0; i < ir1; ++i) {
|
||||||
|
const int64_t i12 = i/(ne11*ne10);
|
||||||
|
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
|
||||||
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
||||||
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||||
|
|
||||||
|
ggml_fp16_to_fp32_row(
|
||||||
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||||
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10817,8 +10829,6 @@ static void ggml_compute_forward_get_rows_f32(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
assert(params->ith == 0);
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -10826,24 +10836,32 @@ static void ggml_compute_forward_get_rows_f32(
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
const int64_t nc = ne00;
|
const int64_t nc = ne00;
|
||||||
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
const int64_t nr = ggml_nelements(src1);
|
||||||
|
|
||||||
assert(ne0 == nc);
|
assert(ne0 == nc);
|
||||||
assert(ne02 == ne11);
|
assert(ne02 == ne11);
|
||||||
assert(nb00 == sizeof(float));
|
assert(nb00 == sizeof(float));
|
||||||
assert(ggml_nrows(dst) == nr);
|
assert(ggml_nrows(dst) == nr);
|
||||||
|
|
||||||
// TODO: multi-thread
|
const int ith = params->ith;
|
||||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
const int nth = params->nth;
|
||||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
|
||||||
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
|
||||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
||||||
|
|
||||||
ggml_vec_cpy_f32(nc,
|
// rows per thread
|
||||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
const int dr = (nr + nth - 1)/nth;
|
||||||
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
|
||||||
}
|
// row range for this thread
|
||||||
}
|
const int ir0 = dr*ith;
|
||||||
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
|
|
||||||
|
for (int64_t i = ir0; i < ir1; ++i) {
|
||||||
|
const int64_t i12 = i/(ne11*ne10);
|
||||||
|
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
|
||||||
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
||||||
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||||
|
|
||||||
|
ggml_vec_cpy_f32(nc,
|
||||||
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
||||||
|
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16374,6 +16392,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_GET_ROWS:
|
||||||
|
{
|
||||||
|
n_tasks = n_threads;
|
||||||
|
} break;
|
||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
case GGML_OP_SET:
|
case GGML_OP_SET:
|
||||||
case GGML_OP_CONT:
|
case GGML_OP_CONT:
|
||||||
|
@ -16381,7 +16403,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
case GGML_OP_VIEW:
|
case GGML_OP_VIEW:
|
||||||
case GGML_OP_PERMUTE:
|
case GGML_OP_PERMUTE:
|
||||||
case GGML_OP_TRANSPOSE:
|
case GGML_OP_TRANSPOSE:
|
||||||
case GGML_OP_GET_ROWS:
|
|
||||||
case GGML_OP_GET_ROWS_BACK:
|
case GGML_OP_GET_ROWS_BACK:
|
||||||
case GGML_OP_DIAG:
|
case GGML_OP_DIAG:
|
||||||
{
|
{
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue