diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m
index 2de105640..85003ebdd 100644
--- a/examples/mtl/mtl.m
+++ b/examples/mtl/mtl.m
@@ -782,49 +782,49 @@ int llama_mtl_eval(
     // TODO
     const float * logits = ctx->out.contents;
 
-    {
-        struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-        if (t->type == GGML_TYPE_F32) {
-            const const float * data = (float *) ctx->out.contents;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", data[i]);
-            }
-            printf("\n");
-            double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                double cur = data[i];
-                if (isinf(cur)) continue;
-                sum += cur;
-            }
-            printf("sum: %f\n", sum);
-        } else if (t->type == GGML_TYPE_F16) {
-            ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", ggml_fp16_to_fp32(data[i]));
-            }
-            printf("\n");
-            double sum = 0.0;
-            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-                            const float curf = ggml_fp16_to_fp32(cur);
-                            if (isinf(curf)) continue;
-                            sum += curf;
-                        }
-                    }
-                }
-            }
-            printf("sum: %f\n", sum);
-        } else {
-            GGML_ASSERT(false && "not implemented");
-        }
-    }
+    //{
+    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
+    //    if (t->type == GGML_TYPE_F32) {
+    //        const float * data = (const float *) ctx->out.contents;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", data[i]);
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        for (int i = 0; i < ggml_nelements(t); i++) {
+    //            double cur = data[i];
+    //            if (isinf(cur)) continue;
+    //            sum += cur;
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    } else if (t->type == GGML_TYPE_F16) {
+    //        const ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+    //                        const ggml_fp16_t cur = *((const ggml_fp16_t *)((const char *) data + offs));
+    //                        const float curf = ggml_fp16_to_fp32(cur);
+    //                        if (isinf(curf)) continue;
+    //                        sum += curf;
+    //                    }
+    //                }
+    //            }
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    } else {
+    //        GGML_ASSERT(false && "not implemented");
+    //    }
+    //}
 
     return 0;
 }
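The block commented out above was a debug dump: it fetched the tensor tagged "mtl-check" from the Metal output buffer and printed its first row plus a checksum. The F16 branch walks the tensor by its byte strides nb[] rather than by flat index, so permuted or otherwise non-contiguous views are summed correctly. Below is a minimal standalone sketch of that traversal pattern; the helper name is hypothetical, only the ggml.h types and ggml_fp16_to_fp32() already used above are assumed, and it reads from t->data rather than the Metal output buffer:

    #include <math.h>

    #include "ggml.h"

    // Hypothetical helper: checksum all F16 elements of a (possibly
    // non-contiguous) tensor. Infinities are skipped, as in the dump above,
    // so e.g. -INF entries from a masked attention matrix do not poison the sum.
    static double tensor_sum_f16(const struct ggml_tensor * t) {
        double sum = 0.0;
        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
                        // nb[] holds strides in bytes, hence the char * arithmetic
                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
                        const float cur = ggml_fp16_to_fp32(*(const ggml_fp16_t *)((const char *) t->data + offs));
                        if (isinf(cur)) continue;
                        sum += cur;
                    }
                }
            }
        }
        return sum;
    }
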
diff --git a/llama.cpp b/llama.cpp
index 81d998c18..e0fbc6f73 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1243,10 +1243,6 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    // TODO: TMP !!!
-    ggml_cgraph gf_export = {};
-    gf_export.n_threads = 1;
-
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1299,12 +1295,6 @@
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             //ggml_build_forward_expand(&gf, t);
-
-            // TODO: TMP !!!!!!!!!!
-            if (il == 0) {
-                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Vcur, v));
-            }
         }
 
         struct ggml_tensor * Q =
@@ -1404,11 +1394,6 @@
         cur = ggml_mul(ctx0, cur, tmp);
 
-        // TODO: TMP !!!!
-        if (il == 0) {
-            ggml_set_name(cur, "mtl-check");
-        }
-
         cur = ggml_mul_mat(ctx0,
                 model.layers[il].w2,
                 cur);
 
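The hunks above drop the temporary gf_export graph and the "mtl-check" tag that fed it. The underlying debugging trick remains available in ggml: name a node with ggml_set_name() while the graph is built, then retrieve it by name from the same context with ggml_get_tensor(), exactly as the removed code and the mtl.m example did. A minimal sketch under that assumption; the function and the "my-check" tag are placeholders, not part of this change:

    #include <stdio.h>

    #include "ggml.h"

    // Hypothetical demo of the tag-and-fetch pattern used by the removed code.
    static void tag_and_fetch(struct ggml_context * ctx) {
        struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * cur = ggml_sqr(ctx, inp);

        // tag the intermediate node while constructing the graph
        ggml_set_name(cur, "my-check");

        // ... graph construction and computation continue ...

        // later: look the node up by name from the same context
        struct ggml_tensor * t = ggml_get_tensor(ctx, "my-check");
        if (!t) {
            fprintf(stderr, "%s: failed to find tensor 'my-check'\n", __func__);
            return;
        }
    }
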
@@ -1444,84 +1429,83 @@
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
-    // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    //ggml_build_forward_expand(&gf, inpL);
-    //ggml_graph_compute       (ctx0, &gf);
+    ggml_build_forward_expand(&gf, inpL);
+    ggml_graph_compute       (ctx0, &gf);
 
-    // lets export a smaller graph to get things rolling -- baby steps first
-    {
-        struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
-        if (!t) {
-            fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__);
-            exit(1);
-        }
-        ggml_build_forward_expand(&gf_export, t);
-    }
+    // TODO: not needed anymore, keeping for a bit
+    //// lets export a smaller graph to get things rolling -- baby steps first
+    //{
+    //    struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
+    //    if (!t) {
+    //        fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__);
+    //        exit(1);
+    //    }
+    //    ggml_build_forward_expand(&gf, t);
+    //}
 
     // print
-    {
-        auto print_t_f32 = [&](struct ggml_tensor * t) {
-            float * data = (float *)t->data;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", data[i]);
-            }
-            printf("\n");
-            double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                double cur = data[i];
-                if (isinf(cur)) continue;
-                sum += data[i];
-            }
-            printf("sum: %f\n", sum);
-        };
-        auto print_t_f16 = [&](struct ggml_tensor * t) {
-            ggml_fp16_t * data = (ggml_fp16_t *)t->data;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", ggml_fp16_to_fp32(data[i]));
-            }
-            printf("\n");
-            double sum = 0.0;
-            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-                            const float curf = ggml_fp16_to_fp32(cur);
-                            if (isinf(curf)) continue;
-                            sum += curf;
-                        }
-                    }
-                }
-            }
-            printf("sum: %f\n", sum);
-        };
+    //{
+    //    auto print_t_f32 = [&](struct ggml_tensor * t) {
+    //        float * data = (float *)t->data;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", data[i]);
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        for (int i = 0; i < ggml_nelements(t); i++) {
+    //            double cur = data[i];
+    //            if (isinf(cur)) continue;
+    //            sum += data[i];
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    };
+    //    auto print_t_f16 = [&](struct ggml_tensor * t) {
+    //        ggml_fp16_t * data = (ggml_fp16_t *)t->data;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
+    //                        const float curf = ggml_fp16_to_fp32(cur);
+    //                        if (isinf(curf)) continue;
+    //                        sum += curf;
+    //                    }
+    //                }
+    //            }
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    };
 
-        ggml_graph_compute(ctx0, &gf_export);
+    //    ggml_graph_compute(ctx0, &gf);
 
-        {
-            auto * t = ggml_get_tensor(ctx0, "mtl-check");
-            switch (t->type) {
-                case GGML_TYPE_F32:
-                    print_t_f32(t);
-                    break;
-                case GGML_TYPE_F16:
-                    print_t_f16(t);
-                    break;
-                default:
-                    fprintf(stderr, "%s: unsupported type\n", __func__);
-                    exit(1);
-            }
-        }
-    }
+    //    {
+    //        auto * t = ggml_get_tensor(ctx0, "mtl-check");
+    //        switch (t->type) {
+    //            case GGML_TYPE_F32:
+    //                print_t_f32(t);
+    //                break;
+    //            case GGML_TYPE_F16:
+    //                print_t_f16(t);
+    //                break;
+    //            default:
+    //                fprintf(stderr, "%s: unsupported type\n", __func__);
+    //                exit(1);
+    //        }
+    //    }
+    //}
 
     if (cgraph_fname) {
-        //ggml_graph_export(&gf, cgraph_fname);
-        ggml_graph_export(&gf_export, cgraph_fname);
+        ggml_graph_export(&gf, cgraph_fname);
    }
 
 #ifdef GGML_PERF
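
With the temporary export graph gone, llama_eval_internal() is back to a single cgraph that is built, computed, and, when cgraph_fname is set, serialized with ggml_graph_export(). A condensed sketch of that flow, assuming the same ggml API used in the diff; the wrapper function is hypothetical and the parameter names simply mirror the code above:

    #include "ggml.h"

    // Hypothetical wrapper restating the flow after this change: the graph
    // that is exported is exactly the graph that was computed.
    static void eval_and_maybe_export(struct ggml_context * ctx0,
                                      struct ggml_tensor  * inpL,
                                      int                   n_threads,
                                      const char          * cgraph_fname) {
        struct ggml_cgraph gf = {};
        gf.n_threads = n_threads;

        ggml_build_forward_expand(&gf, inpL); // inpL (the logits) becomes the graph output
        ggml_graph_compute(ctx0, &gf);        // run the full computation

        if (cgraph_fname) {
            ggml_graph_export(&gf, cgraph_fname); // export what was actually run
        }
    }

Exporting the full computed graph, rather than the hand-picked "mtl-check" subgraph, keeps the Metal example working against the same graph the CPU path evaluates.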