ggml : sync latest ggml_mul_mat_id

2023-12-09 11:19:46 +02:00 · 2023-12-09 11:19:46 +02:00 · 861cd67899
commit 861cd67899
parent a3eefe95a8
4 changed files with 114 additions and 75 deletions
--- a/ggml.c
+++ b/ggml.c
@ -4083,7 +4083,9 @@ struct ggml_tensor * ggml_mul_mat_id(
    int64_t n_as = ids->ne[0];

    GGML_ASSERT(ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_vector(ids));
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+    GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
    GGML_ASSERT(id >= 0 && id < n_as);

@ -9519,11 +9521,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif

+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+              struct ggml_tensor * dst,
+              int64_t off1, int64_t cne1) {
    int64_t t0 = ggml_perf_time_us();
    UNUSED(t0);

@ -9591,10 +9598,9 @@ static void ggml_compute_forward_mul_mat(
                const int64_t i03 = i13/r3;
                const int64_t i02 = i12/r2;

-                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                const void  * x = (char *)            src0->data +             i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+                      float * d = (float *) ((char *)  dst->data + off1*nb1  + i12*nb2  + i13*nb3);

                if (type != GGML_TYPE_F32) {
                            float * const wdata    = params->wdata;
@ -9611,10 +9617,10 @@ static void ggml_compute_forward_mul_mat(
                }

                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne00,
-                        0.0f,    d, ne01);
+                         cne1, ne01, ne10,
+                         1.0f,    y, ne10,
+                                  x, ne00,
+                         0.0f,    d, ne01);
            }
        }

@ -9630,6 +9636,7 @@ static void ggml_compute_forward_mul_mat(
            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

            assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == GGML_TYPE_F32);

            for (int64_t i13 = 0; i13 < ne13; ++i13) {
                for (int64_t i12 = 0; i12 < ne12; ++i12) {
@ -9652,7 +9659,7 @@ static void ggml_compute_forward_mul_mat(
    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

    const int64_t nr0 = ne01;           // src0 rows
-    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+    const int64_t nr1 = cne1*ne12*ne13; // src1 rows

    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);

@ -9694,9 +9701,9 @@ static void ggml_compute_forward_mul_mat(
    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*ne11));
-                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+                const int64_t i13 = (ir1/(ne12*cne1));
+                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;

                // broadcast src0 into src1
                const int64_t i03 = i13/r3;
@ -9736,20 +9743,26 @@ static void ggml_compute_forward_mul_mat(

 static void ggml_compute_forward_mul_mat_id(
        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
              struct ggml_tensor * dst) {

-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+        ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+        return;
+    }

+    const struct ggml_tensor * ids = src0;
    const int id = ggml_get_op_params_i32(dst, 0);

-    const int a_id = ((int32_t *)ids->data)[id];
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+        GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);

-    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
-
-    const struct ggml_tensor * src0 = dst->src[a_id + 2];
-
-    ggml_compute_forward_mul_mat(params, src0, src1, dst);
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+        ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+    }
 }

 // ggml_compute_forward_out_prod
@ -14037,11 +14050,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            } break;
        case GGML_OP_MUL_MAT:
            {
-                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
-                ggml_compute_forward_mul_mat_id(params, tensor);
+                ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_OUT_PROD:
            {