mtl : full GPU inference of the computation graph

Georgi Gerganov 2023-06-01 21:50:01 +03:00
parent fbd3f6258d
commit 9665429e94
2 changed files with 112 additions and 128 deletions
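
For context, the change below finishes the switch to evaluating the whole computation graph on the GPU: llama_eval_internal goes back to building and computing the full graph with ggml_build_forward_expand/ggml_graph_compute and, when cgraph_fname is set, exports that full graph with ggml_graph_export instead of the temporary gf_export sub-graph; the exported graph is what the Metal path (llama_mtl_eval, first hunk) then evaluates. For reference, a minimal self-contained sketch of the CPU-side build/compute/export step, using only ggml calls that also appear in this diff; the tensor shapes and the file name "graph.ggml" are placeholder assumptions, not part of the commit:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx0 = ggml_init(params);

        // two small input matrices and one mat-mul node
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4, 4);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4, 4);
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(b, 2.0f);

        struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
        ggml_set_name(c, "result");

        // build the forward graph, run it on the CPU, then export the full graph to disk
        struct ggml_cgraph gf = ggml_build_forward(c);
        gf.n_threads = 1;
        ggml_graph_compute(ctx0, &gf);
        ggml_graph_export(&gf, "graph.ggml");

        ggml_free(ctx0);
        return 0;
    }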


@@ -782,49 +782,49 @@ int llama_mtl_eval(
     // TODO
     const float * logits = ctx->out.contents;
-    {
-        struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-        if (t->type == GGML_TYPE_F32) {
-            const const float * data = (float *) ctx->out.contents;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", data[i]);
-            }
-            printf("\n");
-            double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                double cur = data[i];
-                if (isinf(cur)) continue;
-                sum += cur;
-            }
-            printf("sum: %f\n", sum);
-        } else if (t->type == GGML_TYPE_F16) {
-            ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", ggml_fp16_to_fp32(data[i]));
-            }
-            printf("\n");
-            double sum = 0.0;
-            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-                            const float curf = ggml_fp16_to_fp32(cur);
-                            if (isinf(curf)) continue;
-                            sum += curf;
-                        }
-                    }
-                }
-            }
-            printf("sum: %f\n", sum);
-        } else {
-            GGML_ASSERT(false && "not implemented");
-        }
-    }
+    //{
+    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
+    //    if (t->type == GGML_TYPE_F32) {
+    //        const const float * data = (float *) ctx->out.contents;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", data[i]);
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        for (int i = 0; i < ggml_nelements(t); i++) {
+    //            double cur = data[i];
+    //            if (isinf(cur)) continue;
+    //            sum += cur;
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    } else if (t->type == GGML_TYPE_F16) {
+    //        ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
+    //                        const float curf = ggml_fp16_to_fp32(cur);
+    //                        if (isinf(curf)) continue;
+    //                        sum += curf;
+    //                    }
+    //                }
+    //            }
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    } else {
+    //        GGML_ASSERT(false && "not implemented");
+    //    }
+    //}
     return 0;
 }
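
The block commented out above was a temporary check that summed the Metal output tensor so it could be compared against the CPU result. For reference, here is the same strided F16 traversal as a standalone sketch; the helper name is hypothetical, and it reads from t->data rather than from the Metal output buffer used above:

    #include <math.h>
    #include "ggml.h"

    // Hypothetical helper: sum all finite values of an F16 tensor by walking its
    // byte strides (nb[]), the same traversal as the commented-out check above.
    static double tensor_sum_f16(const struct ggml_tensor * t) {
        const char * data = (const char *) t->data;
        double sum = 0.0;
        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
                        const ggml_fp16_t  val  = *(const ggml_fp16_t *)(data + offs);
                        const float        valf = ggml_fp16_to_fp32(val);
                        if (isinf(valf)) continue; // ignore +/-inf values, as the original check does
                        sum += valf;
                    }
                }
            }
        }
        return sum;
    }

Walking nb[] instead of assuming a contiguous layout keeps the sum correct for non-contiguous tensors, which is presumably why the original check printed the strides first.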

llama.cpp

@@ -1243,10 +1243,6 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-    // TODO: TMP !!!
-    ggml_cgraph gf_export = {};
-    gf_export.n_threads = 1;
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1299,12 +1295,6 @@ static bool llama_eval_internal(
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             //ggml_build_forward_expand(&gf, t);
-            // TODO: TMP !!!!!!!!!!
-            if (il == 0) {
-                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Vcur, v));
-            }
         }
         struct ggml_tensor * Q =
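
For context on the two ggml_build_forward_expand calls kept above: writing Kcur/Vcur into the KV cache is expressed as ggml_cpy nodes that are forced into the graph, so the copies run as part of ggml_graph_compute. A small sketch of that pattern with hypothetical names (cache layout simplified to a flat 1D view):

    #include "ggml.h"

    // Sketch only: store a freshly computed tensor into a persistent cache buffer
    // by emitting a ggml_cpy node into the graph; the copy only happens when the
    // graph is computed, which is why it must be added explicitly with
    // ggml_build_forward_expand.
    static void store_to_cache(
            struct ggml_context * ctx0,
            struct ggml_cgraph  * gf,
            struct ggml_tensor  * cache, // persistent buffer (e.g. a K or V cache)
            struct ggml_tensor  * cur,   // freshly computed values for N tokens
            int n_embd, int n_past, int N) {
        // 1D view into the cache at the position of the first new token
        struct ggml_tensor * dst = ggml_view_1d(ctx0, cache, N*n_embd,
                (ggml_element_size(cache)*n_embd)*n_past);

        ggml_build_forward_expand(gf, ggml_cpy(ctx0, cur, dst));
    }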
@@ -1404,11 +1394,6 @@ static bool llama_eval_internal(
             cur = ggml_mul(ctx0, cur, tmp);
-            // TODO: TMP !!!!
-            if (il == 0) {
-                ggml_set_name(cur, "mtl-check");
-            }
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
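
The removed "mtl-check" lines relied on ggml's name-based lookup: a node is tagged with ggml_set_name while the graph is being built and retrieved again later with ggml_get_tensor to inspect or export just that part. A self-contained sketch of the pattern; the name "my-check" and the tiny graph are placeholders:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { /*.mem_size =*/ 1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx0 = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);
        ggml_set_f32(x, 2.0f);

        // tag an intermediate node while the graph is being built ...
        struct ggml_tensor * y = ggml_sqr(ctx0, x);
        ggml_set_name(y, "my-check");

        struct ggml_cgraph gf = ggml_build_forward(y);
        gf.n_threads = 1;
        ggml_graph_compute(ctx0, &gf);

        // ... and fetch it again by name afterwards, as the removed code did with "mtl-check"
        struct ggml_tensor * t = ggml_get_tensor(ctx0, "my-check");
        printf("first element: %f\n", ggml_get_f32_1d(t, 0)); // expect 4.0

        ggml_free(ctx0);
        return 0;
    }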
@@ -1444,84 +1429,83 @@ static bool llama_eval_internal(
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
-    // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    //ggml_build_forward_expand(&gf, inpL);
-    //ggml_graph_compute (ctx0, &gf);
-    // lets export a smaller graph to get things rolling -- baby steps first
-    {
-        struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
-        if (!t) {
-            fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__);
-            exit(1);
-        }
-        ggml_build_forward_expand(&gf_export, t);
-    }
+    ggml_build_forward_expand(&gf, inpL);
+    ggml_graph_compute (ctx0, &gf);
+    // TODO: not needed anymore, keeping for a bit
+    //// lets export a smaller graph to get things rolling -- baby steps first
+    //{
+    //    struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
+    //    if (!t) {
+    //        fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__);
+    //        exit(1);
+    //    }
+    //    ggml_build_forward_expand(&gf, t);
+    //}
     // print
-    {
-        auto print_t_f32 = [&](struct ggml_tensor * t) {
-            float * data = (float *)t->data;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", data[i]);
-            }
-            printf("\n");
-            double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                double cur = data[i];
-                if (isinf(cur)) continue;
-                sum += data[i];
-            }
-            printf("sum: %f\n", sum);
-        };
-        auto print_t_f16 = [&](struct ggml_tensor * t) {
-            ggml_fp16_t * data = (ggml_fp16_t *)t->data;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", ggml_fp16_to_fp32(data[i]));
-            }
-            printf("\n");
-            double sum = 0.0;
-            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-                            const float curf = ggml_fp16_to_fp32(cur);
-                            if (isinf(curf)) continue;
-                            sum += curf;
-                        }
-                    }
-                }
-            }
-            printf("sum: %f\n", sum);
-        };
-        ggml_graph_compute(ctx0, &gf_export);
-        {
-            auto * t = ggml_get_tensor(ctx0, "mtl-check");
-            switch (t->type) {
-                case GGML_TYPE_F32:
-                    print_t_f32(t);
-                    break;
-                case GGML_TYPE_F16:
-                    print_t_f16(t);
-                    break;
-                default:
-                    fprintf(stderr, "%s: unsupported type\n", __func__);
-                    exit(1);
-            }
-        }
-    }
+    //{
+    //    auto print_t_f32 = [&](struct ggml_tensor * t) {
+    //        float * data = (float *)t->data;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", data[i]);
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        for (int i = 0; i < ggml_nelements(t); i++) {
+    //            double cur = data[i];
+    //            if (isinf(cur)) continue;
+    //            sum += data[i];
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    };
+    //    auto print_t_f16 = [&](struct ggml_tensor * t) {
+    //        ggml_fp16_t * data = (ggml_fp16_t *)t->data;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
+    //                        const float curf = ggml_fp16_to_fp32(cur);
+    //                        if (isinf(curf)) continue;
+    //                        sum += curf;
+    //                    }
+    //                }
+    //            }
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    };
+    //    ggml_graph_compute(ctx0, &gf);
+    //    {
+    //        auto * t = ggml_get_tensor(ctx0, "mtl-check");
+    //        switch (t->type) {
+    //            case GGML_TYPE_F32:
+    //                print_t_f32(t);
+    //                break;
+    //            case GGML_TYPE_F16:
+    //                print_t_f16(t);
+    //                break;
+    //            default:
+    //                fprintf(stderr, "%s: unsupported type\n", __func__);
+    //                exit(1);
+    //        }
+    //    }
+    //}
     if (cgraph_fname) {
-        //ggml_graph_export(&gf, cgraph_fname);
-        ggml_graph_export(&gf_export, cgraph_fname);
+        ggml_graph_export(&gf, cgraph_fname);
     }
 #ifdef GGML_PERF