mtl : export just a small part of the graph for now to make it easier
parent a792cbd0fc
commit 897d6d8e8f

2 changed files with 23 additions and 9 deletions
@@ -117,15 +117,15 @@ struct ggml_mtl_context * llama_mtl_init(
         ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
         ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
-        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add);
+        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add);
 
         ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
         ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
-        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu);
+        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu);
 
         ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
         ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
-        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max);
+        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max);
     }
 
     // MTLBuffer approach
 
@@ -217,11 +217,11 @@ int llama_mtl_eval(
 
     // copy the input data to the GPU
     {
-        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
+        struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
 
-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, inp, &offs_src0);
+        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);
 
-        memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp));
+        memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }
 
     for (int i = 0; i < gf->n_nodes; ++i) {
 
llama.cpp (20 changes)
@@ -1243,6 +1243,10 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
+    // TODO: TMP !!!
+    ggml_cgraph gf_export = {};
+    gf_export.n_threads = 1;
+
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
@@ -1264,6 +1268,11 @@ static bool llama_eval_internal(
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }
 
+        // TODO: TMP !!!!
+        if (il == 0) {
+            ggml_set_name(cur, "mtl-check");
+        }
+
         // self-attention
         {
             // compute Q and K and RoPE them
 
@@ -1420,12 +1429,17 @@ static bool llama_eval_internal(
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
+    // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute (ctx0, &gf);
+    //ggml_build_forward_expand(&gf, inpL);
+    //ggml_graph_compute (ctx0, &gf);
 
+    // lets export a smaller graph to get things rolling -- baby steps first
+    ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check"));
+
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        //ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(&gf_export, cgraph_fname);
     }
 
 #ifdef GGML_PERF
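Aside: the partial-graph export pattern used in this commit can be exercised on its own. Below is a minimal sketch, not part of the diff, that builds a tiny ggml graph, names an intermediate tensor "mtl-check" the same way the diff does for layer 0, expands only that tensor into a separate single-threaded cgraph, and writes it out with ggml_graph_export. The relu op, the tensor size, the context memory budget, and the output file name "mtl-check.ggml" are illustrative assumptions; only the ggml calls shown in the diff are taken from the commit itself.

    #include "ggml.h"

    int main(void) {
        // small scratch context for the example graph (size is an arbitrary assumption)
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx0 = ggml_init(params);

        // a tiny stand-in for an intermediate activation: cur = relu(x)
        struct ggml_tensor * x   = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);
        struct ggml_tensor * cur = ggml_relu(ctx0, x);

        // name the tensor we want to check, as the diff does for layer 0
        ggml_set_name(cur, "mtl-check");

        // build a separate, single-threaded graph that only reaches "mtl-check" ...
        struct ggml_cgraph gf_export = {};
        gf_export.n_threads = 1;
        ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check"));

        // ... and export just that sub-graph to disk
        ggml_graph_export(&gf_export, "mtl-check.ggml");

        ggml_free(ctx0);
        return 0;
    }

Compiled alongside ggml (as C++ here, matching llama.cpp), this produces a small exported cgraph of the kind the Metal example can pick up, instead of exporting the full transformer graph in one go.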