mtl : full GPU inference of the computation graph

parent fbd3f6258d
commit 9665429e94

2 changed files with 112 additions and 128 deletions
@@ -782,49 +782,49 @@ int llama_mtl_eval(
     // TODO
     const float * logits = ctx->out.contents;
 
-    {
-        struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-        if (t->type == GGML_TYPE_F32) {
-            const float * data = (const float *) ctx->out.contents;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", data[i]);
-            }
-            printf("\n");
-            double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                double cur = data[i];
-                if (isinf(cur)) continue;
-                sum += cur;
-            }
-            printf("sum: %f\n", sum);
-        } else if (t->type == GGML_TYPE_F16) {
-            const ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", ggml_fp16_to_fp32(data[i]));
-            }
-            printf("\n");
-            double sum = 0.0;
-            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-                            const float curf = ggml_fp16_to_fp32(cur);
-                            if (isinf(curf)) continue;
-                            sum += curf;
-                        }
-                    }
-                }
-            }
-            printf("sum: %f\n", sum);
-        } else {
-            GGML_ASSERT(false && "not implemented");
-        }
-    }
+    //{
+    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
+    //    if (t->type == GGML_TYPE_F32) {
+    //        const float * data = (const float *) ctx->out.contents;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", data[i]);
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        for (int i = 0; i < ggml_nelements(t); i++) {
+    //            double cur = data[i];
+    //            if (isinf(cur)) continue;
+    //            sum += cur;
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    } else if (t->type == GGML_TYPE_F16) {
+    //        const ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
+    //                        const float curf = ggml_fp16_to_fp32(cur);
+    //                        if (isinf(curf)) continue;
+    //                        sum += curf;
+    //                    }
+    //                }
+    //            }
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    } else {
+    //        GGML_ASSERT(false && "not implemented");
+    //    }
+    //}
 
     return 0;
 }
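Note: the check above walks the tensor by its byte strides nb[0..3] rather than indexing a flat array, so the same loop also fingerprints non-contiguous (permuted or transposed) views. A minimal standalone sketch of the idea, assuming only the public ggml API from ggml.h — the tensor_sum_f16 helper name is illustrative, not part of the codebase:

#include <math.h>

#include "ggml.h"

// Sum all F16 elements of a tensor, addressing each element by its byte
// offset so that permuted/transposed views are handled correctly.
static double tensor_sum_f16(const struct ggml_tensor * t) {
    double sum = 0.0;
    for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
        for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
            for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
                for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
                    // byte offset of element (i0, i1, i2, i3)
                    const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
                    const float cur = ggml_fp16_to_fp32(*(const ggml_fp16_t *)((const char *) t->data + offs));
                    // mirror the isinf() skip used above so the fingerprint stays finite
                    if (isinf(cur)) continue;
                    sum += cur;
                }
            }
        }
    }
    return sum;
}

An F32 variant only needs the load expression changed; the running sum serves as a cheap checksum for comparing CPU and Metal results.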

llama.cpp (154 changed lines)

@@ -1243,10 +1243,6 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    // TODO: TMP !!!
-    ggml_cgraph gf_export = {};
-    gf_export.n_threads = 1;
-
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));

@@ -1299,12 +1295,6 @@ static bool llama_eval_internal(
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             //ggml_build_forward_expand(&gf, t);
-
-            // TODO: TMP !!!!!!!!!!
-            if (il == 0) {
-                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Vcur, v));
-            }
         }
 
         struct ggml_tensor * Q =

@@ -1404,11 +1394,6 @@ static bool llama_eval_internal(
 
         cur = ggml_mul(ctx0, cur, tmp);
 
-        // TODO: TMP !!!!
-        if (il == 0) {
-            ggml_set_name(cur, "mtl-check");
-        }
-
         cur = ggml_mul_mat(ctx0,
                 model.layers[il].w2,
                 cur);

@@ -1444,84 +1429,83 @@ static bool llama_eval_internal(
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
-    // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    //ggml_build_forward_expand(&gf, inpL);
-    //ggml_graph_compute (ctx0, &gf);
+    ggml_build_forward_expand(&gf, inpL);
+    ggml_graph_compute (ctx0, &gf);
 
-    // lets export a smaller graph to get things rolling -- baby steps first
-    {
-        struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
-        if (!t) {
-            fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__);
-            exit(1);
-        }
-        ggml_build_forward_expand(&gf_export, t);
-    }
+    // TODO: not needed anymore, keeping for a bit
+    //// lets export a smaller graph to get things rolling -- baby steps first
+    //{
+    //    struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
+    //    if (!t) {
+    //        fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__);
+    //        exit(1);
+    //    }
+    //    ggml_build_forward_expand(&gf, t);
+    //}
 
     // print
-    {
-        auto print_t_f32 = [&](struct ggml_tensor * t) {
-            float * data = (float *) t->data;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", data[i]);
-            }
-            printf("\n");
-            double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                double cur = data[i];
-                if (isinf(cur)) continue;
-                sum += cur;
-            }
-            printf("sum: %f\n", sum);
-        };
-        auto print_t_f16 = [&](struct ggml_tensor * t) {
-            ggml_fp16_t * data = (ggml_fp16_t *) t->data;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", ggml_fp16_to_fp32(data[i]));
-            }
-            printf("\n");
-            double sum = 0.0;
-            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-                            const float curf = ggml_fp16_to_fp32(cur);
-                            if (isinf(curf)) continue;
-                            sum += curf;
-                        }
-                    }
-                }
-            }
-            printf("sum: %f\n", sum);
-        };
+    //{
+    //    auto print_t_f32 = [&](struct ggml_tensor * t) {
+    //        float * data = (float *) t->data;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", data[i]);
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        for (int i = 0; i < ggml_nelements(t); i++) {
+    //            double cur = data[i];
+    //            if (isinf(cur)) continue;
+    //            sum += cur;
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    };
+    //    auto print_t_f16 = [&](struct ggml_tensor * t) {
+    //        ggml_fp16_t * data = (ggml_fp16_t *) t->data;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
+    //                        const float curf = ggml_fp16_to_fp32(cur);
+    //                        if (isinf(curf)) continue;
+    //                        sum += curf;
+    //                    }
+    //                }
+    //            }
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    };
 
-        ggml_graph_compute(ctx0, &gf_export);
+    //    ggml_graph_compute(ctx0, &gf);
 
-        {
-            auto * t = ggml_get_tensor(ctx0, "mtl-check");
-            switch (t->type) {
-                case GGML_TYPE_F32:
-                    print_t_f32(t);
-                    break;
-                case GGML_TYPE_F16:
-                    print_t_f16(t);
-                    break;
-                default:
-                    fprintf(stderr, "%s: unsupported type\n", __func__);
-                    exit(1);
-            }
-        }
-    }
+    //    {
+    //        auto * t = ggml_get_tensor(ctx0, "mtl-check");
+    //        switch (t->type) {
+    //            case GGML_TYPE_F32:
+    //                print_t_f32(t);
+    //                break;
+    //            case GGML_TYPE_F16:
+    //                print_t_f16(t);
+    //                break;
+    //            default:
+    //                fprintf(stderr, "%s: unsupported type\n", __func__);
+    //                exit(1);
+    //        }
+    //    }
+    //}
 
     if (cgraph_fname) {
-        //ggml_graph_export(&gf, cgraph_fname);
-        ggml_graph_export(&gf_export, cgraph_fname);
+        ggml_graph_export(&gf, cgraph_fname);
     }
 
 #ifdef GGML_PERF
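Note: the verification pattern this commit winds down — tag an intermediate tensor with ggml_set_name, run the graph, then recover the tensor by name with ggml_get_tensor and print a checksum — is self-contained. A minimal sketch using only calls that appear in this diff (sizes and values are arbitrary):

#include <stdio.h>

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);
    for (int i = 0; i < 4; ++i) {
        ((float *) a->data)[i] = 1.0f + i;
        ((float *) b->data)[i] = 2.0f;
    }

    // name the intermediate result so it can be located after the compute
    struct ggml_tensor * cur = ggml_mul(ctx0, a, b);
    ggml_set_name(cur, "mtl-check");

    ggml_cgraph gf = {};
    gf.n_threads = 1;
    ggml_build_forward_expand(&gf, cur);
    ggml_graph_compute(ctx0, &gf);

    // fetch the checkpoint back by name, as llama_eval_internal does above
    struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
    for (int i = 0; i < (int) t->ne[0]; i++) {
        printf("%f ", ((float *) t->data)[i]);
    }
    printf("\n");

    ggml_free(ctx0);
    return 0;
}

Running the same named-tensor checksum on the CPU and on the Metal side is what the mtl-check hook was for; with the full graph now evaluated on the GPU, the commit comments the hook out rather than deleting it ("keeping for a bit").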