mtl : export just a small part of the graph for now to make it easier
commit 897d6d8e8f (parent a792cbd0fc)
2 changed files with 23 additions and 9 deletions
@@ -117,15 +117,15 @@ struct ggml_mtl_context * llama_mtl_init(
     ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
     ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
-    fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add);
+    fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add);

     ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
     ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
-    fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu);
+    fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu);

     ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
     ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
-    fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max);
+    fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max);
 }

 // MTLBuffer approach
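The only functional change in this hunk is the `(void *)` cast on the pipeline handles passed to `fprintf`. As a reminder of the rule being applied, here is a minimal C sketch (not from the commit): `%p` is specified to take a `void *`, so any other pointer type, including the Objective-C object pointers above, gets cast explicitly, which keeps the call well defined and silences `-Wformat` warnings.

```c
#include <stdio.h>

int main(void) {
    int x = 0;
    int * p = &x;

    // portable: %p receives exactly the type it expects
    fprintf(stderr, "p = %p\n", (void *) p);

    // without the cast, -Wall/-Wformat warns that %p expects 'void *'
    // fprintf(stderr, "p = %p\n", p);

    return 0;
}
```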
@@ -217,11 +217,11 @@ int llama_mtl_eval(
     // copy the input data to the GPU
     {
-        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
+        struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");

-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, inp, &offs_src0);
+        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);

-        memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp));
+        memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }

     for (int i = 0; i < gf->n_nodes; ++i) {
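Two things change here: the input tensor is now looked up by the name "embd" instead of "input", and the destination pointer is taken through `char *` before the byte offset is added. The cast matters because `id_dst.contents` is a `void *`, and ISO C/C++ do not define arithmetic on `void *`. A small sketch with a hypothetical helper (not from the commit):

```c
#include <string.h>

// hypothetical helper: copy n bytes into a mapped buffer at a byte offset
static void copy_at_offset(void * base, size_t offs, const void * src, size_t n) {
    memcpy((char *) base + offs, src, n);   // well-defined byte arithmetic
    // memcpy(base + offs, src, n);         // arithmetic on void * is a GNU extension only
}
```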
llama.cpp (20 changed lines)
@@ -1243,6 +1243,10 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

+    // TODO: TMP !!!
+    ggml_cgraph gf_export = {};
+    gf_export.n_threads = 1;
+
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1264,6 +1268,11 @@ static bool llama_eval_internal(
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }

+        // TODO: TMP !!!!
+        if (il == 0) {
+            ggml_set_name(cur, "mtl-check");
+        }
+
         // self-attention
         {
             // compute Q and K and RoPE them
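Tagging the layer-0 activation with `ggml_set_name(cur, "mtl-check")` is what lets the export code further down fetch this tensor by name instead of passing a pointer around. A self-contained sketch of that round trip, assuming only the public ggml C API already used elsewhere in this diff (the toy tensors and buffer size are made up):

```c
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,   // arbitrary scratch size for this toy graph
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * a = ggml_new_f32(ctx0, 2.0f);
    struct ggml_tensor * b = ggml_new_f32(ctx0, 3.0f);

    struct ggml_tensor * cur = ggml_mul(ctx0, a, b);
    ggml_set_name(cur, "mtl-check");                      // tag the intermediate result

    // later, with only the context in hand, look the tensor up by name
    struct ggml_tensor * out = ggml_get_tensor(ctx0, "mtl-check");

    struct ggml_cgraph gf = {0};
    gf.n_threads = 1;
    ggml_build_forward_expand(&gf, out);
    ggml_graph_compute(ctx0, &gf);

    printf("mtl-check = %f\n", ggml_get_f32_1d(out, 0));  // 6.000000

    ggml_free(ctx0);
    return 0;
}
```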
@@ -1420,12 +1429,17 @@ static bool llama_eval_internal(
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);

     // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    //ggml_build_forward_expand(&gf, inpL);
+    //ggml_graph_compute       (ctx0, &gf);
+
+    // lets export a smaller graph to get things rolling -- baby steps first
+    ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check"));

     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        //ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(&gf_export, cgraph_fname);
     }

 #ifdef GGML_PERF
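The exported file now contains only the small `gf_export` graph that ends at "mtl-check". For context, a hedged sketch of the consumer side (not part of this commit, and assuming the `ggml_graph_import()` counterpart of `ggml_graph_export()` from the same ggml revision): load the graph back, recompute it on the CPU, and read the tagged tensor, which gives a reference value the Metal path can be compared against.

```c
#include "ggml.h"
#include <stdio.h>

int main(int argc, char ** argv) {
    const char * fname = argc > 1 ? argv[1] : "llama.ggml";   // hypothetical export file name

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    // read the serialized graph (tensor data and topology) back in
    struct ggml_cgraph gf = ggml_graph_import(fname, &ctx_data, &ctx_eval);
    gf.n_threads = 1;

    // separate context that ggml_graph_compute can use for its work buffer
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,   // arbitrary guess for this small graph
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx_work = ggml_init(params);

    ggml_graph_compute(ctx_work, &gf);

    // the tensor tagged "mtl-check" in llama_eval_internal is the graph output
    struct ggml_tensor * out = ggml_graph_get_tensor(&gf, "mtl-check");
    fprintf(stderr, "mtl-check[0] = %f\n", ggml_get_f32_1d(out, 0));

    ggml_free(ctx_work);
    ggml_free(ctx_data);
    ggml_free(ctx_eval);

    return 0;
}
```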