From a266c26de2030b94f608510ba0e70888a9881b76 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 1 Jun 2023 21:27:24 +0300
Subject: [PATCH] mtl : verify V tensor contents

---
 examples/mtl/mtl.m | 23 +++++++++++++++--------
 llama.cpp          | 27 +++++++++++++++++++--------
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m
index bb0074a4c..24f9479ce 100644
--- a/examples/mtl/mtl.m
+++ b/examples/mtl/mtl.m
@@ -752,19 +752,26 @@ int llama_mtl_eval(
             }
             printf("sum: %f\n", sum);
         } else if (t->type == GGML_TYPE_F16) {
-            const ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
+            ggml_fp16_t * data = (ggml_fp16_t *) ctx->out.contents;
             printf("data: ");
-            int n = ggml_nelements(t);
-            if (n > 10) {
-                n = 10;
-            }
-            for (int i = 0; i < n; i++) {
+            for (int i = 0; i < (int) t->ne[0]; i++) {
                 printf("%f ", ggml_fp16_to_fp32(data[i]));
             }
             printf("\n");
             double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                sum += ggml_fp16_to_fp32(data[i]);
+            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
+                            const float curf = ggml_fp16_to_fp32(cur);
+                            if (isinf(curf)) continue;
+                            sum += curf;
+                        }
+                    }
+                }
             }
             printf("sum: %f\n", sum);
         } else {
diff --git a/llama.cpp b/llama.cpp
index 6825636c8..2cf5a36fc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1341,11 +1341,6 @@ static bool llama_eval_internal(
         struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
         ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
-        // TODO: TMP !!!!
-        if (il == 0) {
-            ggml_set_name(KQ_soft_max, "mtl-check");
-        }
-
         // split cached V into n_head heads
         struct ggml_tensor * V =
             ggml_view_3d(ctx0, kv_self.v,
@@ -1355,6 +1350,11 @@ static bool llama_eval_internal(
                     il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
         ggml_set_name(V, "V");
 
+        // TODO: TMP !!!!
+        if (il == 0) {
+            ggml_set_name(V, "mtl-check");
+        }
+
 #if 1
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
         ggml_set_name(KQV, "KQV");
@@ -1479,13 +1479,24 @@ static bool llama_eval_internal(
     auto print_t_f16 = [&](struct ggml_tensor * t) {
         ggml_fp16_t * data = (ggml_fp16_t *)t->data;
         printf("data: ");
-        for (int i = 0; i < std::min((int) t->ne[0], 10); i++) {
+        for (int i = 0; i < (int) t->ne[0]; i++) {
             printf("%f ", ggml_fp16_to_fp32(data[i]));
         }
         printf("\n");
         double sum = 0.0;
-        for (int i = 0; i < ggml_nelements(t); i++) {
-            sum += ggml_fp16_to_fp32(data[i]);
+        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
+                        const float curf = ggml_fp16_to_fp32(cur);
+                        if (isinf(curf)) continue;
+                        sum += curf;
+                    }
+                }
+            }
         }
         printf("sum: %f\n", sum);
     };
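
Note on the verification loops above (illustrative sketch, not part of the patch): they walk the tensor by its byte strides nb[0..3] instead of assuming a contiguous buffer, which is what makes the checksum meaningful for non-contiguous views such as the cached V tensor, and they skip infinite values, presumably so that masked -inf entries cannot poison the CPU-vs-Metal comparison. A minimal standalone version of the same traversal could look like this, assuming ggml.h is on the include path; sum_f16_strided is a hypothetical helper name:

#include <math.h>    // isinf
#include <stdint.h>  // int64_t
#include "ggml.h"    // struct ggml_tensor, ggml_fp16_t, ggml_fp16_to_fp32

// Sum an F16 tensor element by element using its byte strides (nb),
// so the walk stays correct for non-contiguous views; infinite values
// are skipped, mirroring the verification loops in the patch.
static double sum_f16_strided(const struct ggml_tensor * t) {
    const char * data = (const char *) t->data;
    double sum = 0.0;
    for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
        for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
            for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
                for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
                    const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
                    const float cur = ggml_fp16_to_fp32(*(const ggml_fp16_t *)(data + offs));
                    if (isinf(cur)) continue;
                    sum += cur;
                }
            }
        }
    }
    return sum;
}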