diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m
index 58f1f0371..47bbdb4ad 100644
--- a/examples/mtl/mtl.m
+++ b/examples/mtl/mtl.m
@@ -117,15 +117,15 @@ struct ggml_mtl_context * llama_mtl_init(
 
         ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
         ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
-        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add);
+        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add);
 
         ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
         ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
-        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu);
+        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu);
 
         ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
         ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
-        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max);
+        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max);
     }
 
     // MTLBuffer approach
@@ -217,11 +217,11 @@ int llama_mtl_eval(
 
     // copy the input data to the GPU
     {
-        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
+        struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
 
-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, inp, &offs_src0);
+        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);
 
-        memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp));
+        memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }
 
     for (int i = 0; i < gf->n_nodes; ++i) {
diff --git a/llama.cpp b/llama.cpp
index 9dccf0ed1..e6d544615 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1243,6 +1243,10 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
+    // TODO: TMP !!!
+    ggml_cgraph gf_export = {};
+    gf_export.n_threads = 1;
+
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1264,6 +1268,11 @@ static bool llama_eval_internal(
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }
 
+        // TODO: TMP !!!!
+        if (il == 0) {
+            ggml_set_name(cur, "mtl-check");
+        }
+
         // self-attention
         {
             // compute Q and K and RoPE them
@@ -1420,12 +1429,17 @@ static bool llama_eval_internal(
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
+    // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    //ggml_build_forward_expand(&gf, inpL);
+    //ggml_graph_compute       (ctx0, &gf);
+
+    // lets export a smaller graph to get things rolling -- baby steps first
+    ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check"));
 
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        //ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(&gf_export, cgraph_fname);
     }
 
 #ifdef GGML_PERF