mtl : export just a small part of the graph for now to make it easier
commit 897d6d8e8f (parent a792cbd0fc)
2 changed files with 23 additions and 9 deletions
@@ -117,15 +117,15 @@ struct ggml_mtl_context * llama_mtl_init(
     ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
     ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
-    fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add);
+    fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add);

     ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
     ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
-    fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu);
+    fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu);

     ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
     ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
-    fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max);
+    fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max);
 }

 // MTLBuffer approach
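The only functional change in this hunk is the `(void *)` cast on the pipeline handles passed to `fprintf`. As a reminder of the rule being applied, here is a minimal C sketch (not from the commit): `%p` is specified to take a `void *`, so any other pointer type, including the Objective-C object pointers above, gets cast explicitly, which keeps the call well defined and silences `-Wformat` warnings.

```c
#include <stdio.h>

int main(void) {
    int x = 0;
    int * p = &x;

    // portable: %p receives exactly the type it expects
    fprintf(stderr, "p = %p\n", (void *) p);

    // without the cast, -Wall/-Wformat warns that %p expects 'void *'
    // fprintf(stderr, "p = %p\n", p);

    return 0;
}
```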
@@ -217,11 +217,11 @@ int llama_mtl_eval(
     // copy the input data to the GPU
     {
-        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
+        struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");

-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, inp, &offs_src0);
+        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);

-        memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp));
+        memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }

     for (int i = 0; i < gf->n_nodes; ++i) {
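Two things change here: the input tensor is now looked up by the name "embd" instead of "input", and the destination pointer is taken through `char *` before the byte offset is added. The cast matters because `id_dst.contents` is a `void *`, and ISO C/C++ do not define arithmetic on `void *`. A small sketch with a hypothetical helper (not from the commit):

```c
#include <string.h>

// hypothetical helper: copy n bytes into a mapped buffer at a byte offset
static void copy_at_offset(void * base, size_t offs, const void * src, size_t n) {
    memcpy((char *) base + offs, src, n);   // well-defined byte arithmetic
    // memcpy(base + offs, src, n);         // arithmetic on void * is a GNU extension only
}
```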
llama.cpp (20 changed lines)
@@ -1243,6 +1243,10 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

+    // TODO: TMP !!!
+    ggml_cgraph gf_export = {};
+    gf_export.n_threads = 1;
+
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1264,6 +1268,11 @@ static bool llama_eval_internal(
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }

+        // TODO: TMP !!!!
+        if (il == 0) {
+            ggml_set_name(cur, "mtl-check");
+        }
+
         // self-attention
         {
             // compute Q and K and RoPE them
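Tagging the layer-0 activation with `ggml_set_name(cur, "mtl-check")` is what lets the export code further down fetch this tensor by name instead of passing a pointer around. A self-contained sketch of that round trip, assuming only the public ggml C API already used elsewhere in this diff (the toy tensors and buffer size are made up):

```c
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,   // arbitrary scratch size for this toy graph
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * a = ggml_new_f32(ctx0, 2.0f);
    struct ggml_tensor * b = ggml_new_f32(ctx0, 3.0f);

    struct ggml_tensor * cur = ggml_mul(ctx0, a, b);
    ggml_set_name(cur, "mtl-check");                      // tag the intermediate result

    // later, with only the context in hand, look the tensor up by name
    struct ggml_tensor * out = ggml_get_tensor(ctx0, "mtl-check");

    struct ggml_cgraph gf = {0};
    gf.n_threads = 1;
    ggml_build_forward_expand(&gf, out);
    ggml_graph_compute(ctx0, &gf);

    printf("mtl-check = %f\n", ggml_get_f32_1d(out, 0));  // 6.000000

    ggml_free(ctx0);
    return 0;
}
```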
@@ -1420,12 +1429,17 @@ static bool llama_eval_internal(
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);

     // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    //ggml_build_forward_expand(&gf, inpL);
+    //ggml_graph_compute       (ctx0, &gf);
+
+    // lets export a smaller graph to get things rolling -- baby steps first
+    ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check"));

     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        //ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(&gf_export, cgraph_fname);
     }

 #ifdef GGML_PERF
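The exported file now contains only the small `gf_export` graph that ends at "mtl-check". For context, a hedged sketch of the consumer side (not part of this commit, and assuming the `ggml_graph_import()` counterpart of `ggml_graph_export()` from the same ggml revision): load the graph back, recompute it on the CPU, and read the tagged tensor, which gives a reference value the Metal path can be compared against.

```c
#include "ggml.h"
#include <stdio.h>

int main(int argc, char ** argv) {
    const char * fname = argc > 1 ? argv[1] : "llama.ggml";   // hypothetical export file name

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    // read the serialized graph (tensor data and topology) back in
    struct ggml_cgraph gf = ggml_graph_import(fname, &ctx_data, &ctx_eval);
    gf.n_threads = 1;

    // separate context that ggml_graph_compute can use for its work buffer
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,   // arbitrary guess for this small graph
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx_work = ggml_init(params);

    ggml_graph_compute(ctx_work, &gf);

    // the tensor tagged "mtl-check" in llama_eval_internal is the graph output
    struct ggml_tensor * out = ggml_graph_get_tensor(&gf, "mtl-check");
    fprintf(stderr, "mtl-check[0] = %f\n", ggml_get_f32_1d(out, 0));

    ggml_free(ctx_work);
    ggml_free(ctx_data);
    ggml_free(ctx_eval);

    return 0;
}
```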