llama : Metal inference (#1642)

* mtl : export the LLaMA computation graph

* ci : disable temporarily

* mtl : adapt the MNIST example as starter

* mtl : no need for mtl-export tool, add cli arg for main instead

* mtl : export just a small part of the graph for now to make it easier

* mtl : move MSL code into separate file for easy editing

* mtl : initial get_rows_q4_0 kernel

* mtl : confirmed get_rows_q4_0 is working correctly

* mtl : add rms_norm kernel + confirm working

* mtl : add mul kernel + confirm working

* mtl : initial mul_mat Q4 kernel (wrong results)

* mtl : mul_mat fixes (still wrong)

* mtl : another mul_mat Q4 (still does not work)

* mtl : working mul_mat q4

* ggml : fix handling of "view" ops in ggml_graph_import()

* mtl : add rope kernel

* mtl : add reshape and transpose handling

* ggml : store offset as opt arg for ggml_view_xd() operators

* mtl : add cpy kernel + handle view ops

* mtl : confirm f16 x f32 attention mul mat

* mtl : add scale kernel

* mtl : add diag_mask_inf kernel

* mtl : fix soft_max kernel

* ggml : update ggml_nbytes() to handle non-contiguous tensors

* mtl : verify V tensor contents

* mtl : add f32 -> f32 cpy kernel

* mtl : add silu kernel

* mtl : add non-broadcast mul kernel

* mtl : full GPU inference of the computation graph

* mtl : optimize rms_norm and soft_max kernels

* mtl : add f16 mat x f32 vec multiplication kernel

* mtl : fix bug in f16 x f32 mul mat + speed-up computation

* mtl : faster mul_mat_q4_0_f32 kernel

* mtl : fix kernel signature + roll inner loop

* mtl : more threads for rms_norm + better timing

* mtl : remove printfs from inner loop

* mtl : simplify implementation

* mtl : add save/load vocab to ggml file

* mtl : plug Metal inference into llama.cpp (very quick-n-dirty)

* mtl : make it work with main example

Lots of hacks but at least now it generates text

* mtl : preparing for merge

* mtl : clean-up ggml mtl interface + support scratch / inplace

* mtl : remove temp / debug code

* metal : final refactoring and simplification

* Revert "ci : disable temporarily"

This reverts commit 98c267fc77.

* metal : add comments

* metal : clean-up stuff, fix typos

* readme : add Metal instructions

* readme : add example for main
Georgi Gerganov, 2023-06-04 23:34:30 +03:00, committed by GitHub
commit ecb217db4f (parent dcb2ed4826)
17 changed files with 1677 additions and 94 deletions
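Of the 17 changed files, only ggml.c is reproduced below; the Metal backend itself lives in the new Metal-specific sources this commit adds. Most of the ggml.c changes serve the export/import path that the commit list above describes: main builds the LLaMA graph and serializes it with ggml_graph_export(), and the Metal side re-imports and executes it. A minimal sketch of the export half, using standard ggml calls (the tiny graph and the file name "graph.ggml" are made up for illustration):

    // Minimal export sketch (not from the commit): build a small ggml graph and
    // serialize it so another process can re-import and run it.
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // stand-in for the real LLaMA graph: y = x*x
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * y = ggml_mul(ctx, x, x);

        struct ggml_cgraph gf = ggml_build_forward(y);

        // writes leafs, nodes, shapes, strides and arg indices to the file
        ggml_graph_export(&gf, "graph.ggml");

        ggml_free(ctx);
        return 0;
    }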

ggml.c (153 changed lines)

@@ -3723,7 +3723,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}
-int ggml_nrows(const struct ggml_tensor * tensor) {
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3732,7 +3732,14 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+// this should handle cases where the tensor is not contiguous in memory
+// probably just:
+//
+// return tensor->ne[3]*tensor->nb[3]
+//
+// is enough, but just in case, adding the second part
+return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
}
int ggml_blck_size(enum ggml_type type) {
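The comment added above is the whole idea; a quick worked example with made-up strides shows why the old element-count formula under-reports the span of a non-contiguous view (strides roughly as ggml_view_2d() with nb1 = 32 would set them):

    // Worked example (hypothetical strides): a 4x2 f32 view whose rows sit 32
    // bytes apart inside a larger parent tensor.
    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
        int64_t ne[4] = { 4, 2, 1, 1 };     // elements per dimension
        size_t  nb[4] = { 4, 32, 64, 64 };  // strides in bytes; 32 > 4*4 -> non-contiguous

        size_t type_size = 4;               // GGML_TYPE_F32
        size_t old_nbytes = (size_t)(ne[0]*ne[1]*ne[2]*ne[3])*type_size; // 32 bytes
        size_t new_nbytes = MAX((size_t) ne[3]*nb[3], old_nbytes);       // 64 bytes

        printf("old: %zu bytes, new: %zu bytes\n", old_nbytes, new_nbytes);
        return 0;
    }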
@@ -3814,11 +3821,11 @@ size_t ggml_tensor_overhead(void) {
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
}
-static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
-static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
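Dropping static inline exposes these two layout checks outside ggml.c, presumably so the new Metal code can query tensor layout. With made-up strides, the transposed check is just a stride comparison:

    // Illustration with made-up strides: ggml_transpose() swaps ne[0]/ne[1] and
    // nb[0]/nb[1], so nb[0] > nb[1] flags a transposed (non-contiguous) tensor.
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        size_t nb_contig[4] = {  4, 16, 48, 48 }; // contiguous 4x3 f32 tensor
        size_t nb_trans[4]  = { 16,  4, 48, 48 }; // the same tensor after a transpose

        bool t0 = nb_contig[0] > nb_contig[1];    // false: row-major layout
        bool t1 = nb_trans[0]  > nb_trans[1];     // true: strides are swapped

        printf("contiguous: %d, transposed view: %d\n", t0, t1);
        return 0;
    }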
@@ -5802,10 +5809,18 @@ struct ggml_tensor * ggml_view_1d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+ggml_scratch_save(ctx);
+struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+memcpy(offs->data, &offset, 2*sizeof(int32_t));
+ggml_scratch_load(ctx);
result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5834,6 +5849,13 @@ struct ggml_tensor * ggml_view_2d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+ggml_scratch_save(ctx);
+struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+memcpy(offs->data, &offset, 2*sizeof(int32_t));
+ggml_scratch_load(ctx);
result->nb[1] = nb1;
result->nb[2] = result->nb[1]*ne1;
result->nb[3] = result->nb[2];
@@ -5842,6 +5864,7 @@ struct ggml_tensor * ggml_view_2d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5872,6 +5895,13 @@ struct ggml_tensor * ggml_view_3d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+ggml_scratch_save(ctx);
+struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+memcpy(offs->data, &offset, 2*sizeof(int32_t));
+ggml_scratch_load(ctx);
result->nb[1] = nb1;
result->nb[2] = nb2;
result->nb[3] = result->nb[2]*ne2;
@@ -5880,6 +5910,7 @@ struct ggml_tensor * ggml_view_3d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5912,6 +5943,13 @@ struct ggml_tensor * ggml_view_4d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+ggml_scratch_save(ctx);
+struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+memcpy(offs->data, &offset, 2*sizeof(int32_t));
+ggml_scratch_load(ctx);
result->nb[1] = nb1;
result->nb[2] = nb2;
result->nb[3] = nb3;
@@ -5920,6 +5958,7 @@ struct ggml_tensor * ggml_view_4d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
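All four ggml_view_Xd() variants get the same addition: the byte offset into the source tensor is packed into a 2-element I32 tensor and attached as opt[0], so it can be recovered later (for example by ggml_graph_import() further down). Two int32 slots are exactly the 8 bytes a 64-bit offset needs; a minimal round-trip sketch with a made-up offset:

    // Offset round trip: ggml_view_Xd() packs the offset into 2*sizeof(int32_t)
    // bytes (the opt[0] tensor); the import code memcpy's it back into a uint64_t.
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        uint64_t offset = 0x12345678u;  // hypothetical byte offset into src0 (a size_t on the 64-bit targets here)

        int32_t storage[2];                           // stands in for offs->data
        memcpy(storage, &offset, 2*sizeof(int32_t));  // what ggml_view_xd() now does

        uint64_t offs;
        memcpy(&offs, storage, sizeof(offs));         // what ggml_graph_import() does

        printf("round-tripped offset: %llu\n", (unsigned long long) offs);
        return 0;
    }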
@@ -9252,7 +9291,7 @@ static void ggml_compute_forward_rms_norm_f32(
sum += (ggml_float)(x[i00] * x[i00]);
}
-float mean = sum/ne00;
+const float mean = sum/ne00;
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
@@ -11163,7 +11202,7 @@ static void ggml_compute_forward_rope_f32(
theta *= theta_scale;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[1];
@@ -11184,7 +11223,7 @@ static void ggml_compute_forward_rope_f32(
const int64_t i0 = ib*n_dims + ic/2;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
@@ -14588,7 +14627,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
-fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n",
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
tensor->n_dims,
@@ -14602,7 +14641,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
-fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n",
arg,
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
@@ -14615,8 +14654,8 @@
}
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-assert(cgraph->work == NULL);
-assert(cgraph->work_size == 0);
+//assert(cgraph->work == NULL);
+//assert(cgraph->work_size == 0);
uint64_t size_eval = 0;
@@ -14837,7 +14876,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
// read file into data
{
FILE * fin = fopen(fname, "rb");
if (!fin) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
return result;
@@ -14977,6 +15015,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
op = *(const uint32_t *) ptr; ptr += sizeof(op);
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+enum ggml_op eop = (enum ggml_op) op;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
@@ -14991,42 +15031,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
nb[j] = nb_cur;
}
-struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
-uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
-tensor->op = (enum ggml_op) op;
+const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
+uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
-memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+// parse args
+for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+const int32_t arg_idx = ptr_arg_idx[j];
+if (arg_idx == -1) {
+continue;
+}
+if (arg_idx < GGML_MAX_NODES) {
+args[j] = result.leafs[arg_idx];
+} else {
+args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+}
+}
+// create the tensor
+// "view" operations are handled differently
+// TODO: handle inplace ops - currently a copy is always made
+struct ggml_tensor * tensor = NULL;
+switch (eop) {
+// TODO: implement other view ops
+case GGML_OP_RESHAPE:
+{
+tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
+} break;
+case GGML_OP_VIEW:
+{
+tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+uint64_t offs;
+memcpy(&offs, args[2]->data, sizeof(offs));
+tensor->data = ((char *) tensor->data) + offs;
+} break;
+case GGML_OP_TRANSPOSE:
+{
+tensor = ggml_transpose(*ctx_eval, args[0]);
+} break;
+case GGML_OP_PERMUTE:
+{
+tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+} break;
+default:
+{
+tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+tensor->op = eop;
+} break;
+}
+memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
tensor->nb[j] = nb[j];
}
-// parse args
-{
-struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
-&tensor->src0,
-&tensor->src1,
-};
+tensor->src0 = args[0];
+tensor->src1 = args[1];
-for (int j = 0; j < GGML_MAX_OPT; ++j) {
-args[2 + j] = &tensor->opt[j];
-}
-for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
-const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
-if (arg_idx == -1) {
-continue;
-}
-if (arg_idx < GGML_MAX_NODES) {
-*args[j] = result.leafs[arg_idx];
-} else {
-*args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
-}
-}
+for (int j = 0; j < GGML_MAX_OPT; ++j) {
+tensor->opt[j] = args[2 + j];
+}
result.nodes[i] = tensor;
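Summing up the import side: ggml_graph_import() now re-creates RESHAPE/VIEW/TRANSPOSE/PERMUTE nodes through the regular ggml API instead of allocating fresh tensors, and for views it shifts the data pointer by the offset recovered from opt[0]. A rough usage sketch (the file name matches the hypothetical export example above):

    // Rough import sketch (not from the commit): ctx_data receives the tensor
    // data read from the file, ctx_eval receives the re-created graph structure.
    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;

        struct ggml_cgraph gf = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);

        printf("imported graph: %d nodes, %d leafs\n", gf.n_nodes, gf.n_leafs);

        ggml_free(ctx_data);
        ggml_free(ctx_eval);
        return 0;
    }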