From d85a629a6c2efdb2fa257305585c7ac66faed020 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Mon, 22 Jan 2024 23:28:52 +0100
Subject: [PATCH 01/10] Update ggml.c

---
 ggml.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 227 insertions(+)

diff --git a/ggml.c b/ggml.c
index f85045c9c..267e99c58 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20158,6 +20158,233 @@ void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor) {
+
+    char *tensor_data;
+    if (tensor->backend != GGML_BACKEND_CPU) {
+        // for any mmap solution we can actually set the CPU data of a tensor during load even if it's GPU offloaded
+        // this shouldn't have a negative effect, worked well in ggllm, saves the need of tensor_get operations for weights
+        if (tensor->buffer == NULL) {
+            printf("ggml_printTensorSample: tensor buffer is NULL\n");
+            return;
+        }
+        tensor_data = (char *) malloc(ggml_nbytes(tensor));
+        ggml_backend_tensor_get(tensor, tensor_data, 0, ggml_nbytes(tensor));
+    } else
+    {
+        tensor_data = tensor->data;
+        if (tensor_data == NULL) {
+            printf("ggml_printTensorSample: tensor data is NULL\n");
+            return;
+        }
+    }
+
+    const char *sep = "+-------------------------------------------------------------------------------------------+\n";
+    printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
+
+    const int MAX_ELEMENTS_ROW = 10;
+    const int MAX_ELEMENTS_COL = 6;
+    const int MAX_ELEMENTS_LAYER = 3; // layered
+    const int MAX_ELEMENTS_BATCH = 2; // repeated display
+    const char *dimensionLabels[] = {"Row", "Col", "Layer", "Batch"};
+
+    printf("\n%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
+    printf("| Total Elements : [ ");
+    for (int i = 0; i < ggml_n_dims(tensor); i++)
+        printf("%s:%-3" PRId64 " ", dimensionLabels[i], tensor->ne[i]);
+    printf("]\n%s", sep);
+
+    int n_dims = ggml_n_dims(tensor);
+
+    if (n_dims == 1) {
+        printf("| 1: ");
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("%-7.3f, ", *(double *)((char *) tensor_data + i*tensor->nb[0]));
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
+        printf("\n%s", sep);
+    }
+    else if (n_dims == 2) {
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("| %d: ", i+1);
+            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
+                printf("%-7.3f ", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1]));
+                if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
+            }
+            printf("\n");
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" .. additional rows\n");
+        printf("%s", sep);
+    } else if(n_dims == 3) {
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("| Row %d: ", i+1);
+            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
+                printf("[");
+                for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
+                    printf("%-7.3f", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
+                    if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
+                        printf(", ");
+                }
+                if(MAX_ELEMENTS_LAYER < tensor->ne[2]) printf(", ..");
+                printf("] ");
+            }
+            printf("\n");
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" ... additional layers\n");
+        printf("%s", sep);
+    } else if(n_dims == 4) {
+        for(int batch = 0; batch < tensor->ne[0] && batch < MAX_ELEMENTS_BATCH; batch++){
+            printf("Batch %d\n", batch+1);
+            for(int i = 0; i < tensor->ne[1] && i < MAX_ELEMENTS_ROW; i++){
+                printf("| Row %d: ", i+1);
+                for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
+                    printf("[");
+                    for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
+                        printf("%-7.3f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
+                        if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
+                            printf(", ");
+                    }
+                    if(MAX_ELEMENTS_LAYER < tensor->ne[3]) printf(", ..");
+                    printf("] ");
+                }
+                printf("\n");
+            }
+            if(MAX_ELEMENTS_BATCH < tensor->ne[0]) printf(" ... additional batches\n");
+            printf("%s", sep);
+        }
+    }
+    if (tensor->backend != GGML_BACKEND_CPU)
+        free(tensor_data);
+}
+
+void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample) {
+    char tmp_str[256] = {0};
+    int pos=0;
+    const char *sep = "+----------------------+----------------------+----------------------+----------------------+";
+    const char *sep_border = "+======================+======================+======================+======================+";
+    printf("%s\n", sep_border);
+    printf("| %s:%d\n", prefix,line);
+    printf("| %-32s [%s type]\n", tensor->name, ggml_type_name(tensor->type));
+    printf("%s\n", sep);
+    char strides[256] = {0};
+    /**
+    // nb[0] = sizeof(type)
+    // nb[1] = nb[0] * ne[0] + padding
+    // nb[i] = nb[i-1] * ne[i-1]
+    */
+    {
+        strides[0] = '\0';
+        for (int i = 0; i < ggml_n_dims(tensor); i++) {
+            char dim_str[20];
+            snprintf(dim_str, sizeof(dim_str), "%" PRId64, tensor->nb[i]);
+            strncat(strides, dim_str, sizeof(strides) - strlen(strides) - 1);
+            if (i != ggml_n_dims(tensor) - 1) {
+                strncat(strides, "x", sizeof(strides) - strlen(strides) - 1);
+            }
+        }
+    }
+
+    printf("| %-20s | %-20s | %-20s | %-20s |\n", "Dimensions", "Strides", "Layer id", "Backend");
+    int layer_id=-1; // tensor->meta structure not available
+    printf("| %-20d | %-20s | %-20d | %-20s |\n", ggml_n_dims(tensor), strides, layer_id, tensor->backend == GGML_BACKEND_CPU ? "CPU" : ((tensor->backend == GGML_BACKEND_GPU) ? "GPU" : "GPU_SPLIT"));
+    printf("%s\n", sep);
+    pos = 0;
+    for (int i = 0; i < ggml_n_dims(tensor); i++) {
+        pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->ne[i]);
+        if (i != ggml_n_dims(tensor) - 1) {
+            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
+        }
+    }
+    printf("| %-20s | %-20s | %-20s | %-20s |\n", "Elements", "Src0", "Src1","Operation");
+    printf("| %-20s |", tmp_str);
+
+    if (tensor->src[0]) {
+        pos = 0;
+        for (int i = 0; i < ggml_n_dims(tensor->src[0]); i++) {
+            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->src[0]->ne[i]);
+            if (i != ggml_n_dims(tensor->src[0]) - 1) {
+                pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
+            }
+        }
+        printf(" %-20s |", tmp_str);
+    } else {
+        printf(" %-20s |", "N/A");
+    }
+    if (tensor->src[1]) {
+        pos = 0;
+        for (int i = 0; i < ggml_n_dims(tensor->src[1]); i++) {
+            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->src[1]->ne[i]);
+            if (i != ggml_n_dims(tensor->src[1]) - 1) {
+                pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
+            }
+        }
+        printf(" %-20s |", tmp_str);
+    } else {
+        printf(" %-20s |", "N/A");
+    }
+    printf(" %-20s |", tensor->op > 0 ? GGML_OP_NAME[tensor->op] : "N/A");
+    printf("\n");
+    printf("%s\n", sep);
+
+    if (extended) {
+        bool is_transposed = ggml_is_transposed(tensor);
+        bool is_permuted = ggml_is_permuted(tensor);
+        bool is_cont = ggml_is_contiguous(tensor);
+        printf("| %-17s%s | %-17s%s | %-17s%s | %-6s%11.2f MB |\n", "Transposed:", is_transposed ? "Yes" : "No ", "Permuted:", is_permuted ? "Yes" : "No ", "Contiguous:", is_cont ? "Yes" : "No ","Size:", ggml_nbytes(tensor)/(1024.0*1024.0));
+    }
+
+    if (extended) {
+        if (tensor->src[0] && strlen(tensor->src[0]->name)) {
+            printf("| %-20s | ", "Src0 name:");
+            printf("%-66s |\n", tensor->src[0]->name);
+        }
+        if (tensor->src[1] && strlen(tensor->src[1]->name)) {
+            printf("| %-20s | ", "Src1 name:");
+            printf("%-66s |\n", tensor->src[1]->name);
+        }
+        printf("%s\n\n", sep);
+    }
+
+    if (print_sample) {
+        if (extended) {
+            if (tensor->src[0] && tensor->src[0]->ne[0]) {
+                ggml_printTensorSample("src0", tensor->src[0]);
+            }
+            if (tensor->src[1] && tensor->src[1]->ne[0]) {
+                ggml_printTensorSample("src1", tensor->src[1]);
+            }
+        }
+        ggml_printTensorSample("dst", tensor);
+    }
+    printf("%s\n", sep_border);
+}
+
+float ggml_get_tensor_index(const struct ggml_tensor* tensor, int ind1, int ind2, int ind3, int ind4) {
+    int n_dims = ggml_n_dims(tensor);
+    if (n_dims < 1 || n_dims > 4) {
+        printf("Error: Incorrect dimension number %d\n", n_dims);
+        return -1; // handle error
+    }
+
+    int indices[4] = {ind1, ind2, ind3, ind4};
+    int total_offset = 0;
+
+    for (int i = 0; i < n_dims; i++) {
+        if (indices[i] >= tensor->ne[i] || indices[i] < 0) {
+            printf("Error: Incorrect index for dimension %d\n", i);
+            printf("Index: %d, Dimension size: %" PRId64 "\n", indices[i], tensor->ne[i]);
+            return -1; // handle error
+        }
+
+        total_offset += indices[i] * tensor->nb[i];
+    }
+
+    // Return the value at the calculated offset
+    return *(float *)((char *) tensor->data + total_offset);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
     return 1;
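A sketch of how these helpers are meant to be driven from a caller. This is illustrative only and not part of the patch: ctx, model.wq and inpL are hypothetical names standing in for whatever context, weight and activation are in scope at the call site.

    // Hypothetical call site while building a graph; "cur" is the node of interest.
    struct ggml_tensor * cur = ggml_mul_mat(ctx, model.wq, inpL);
    ggml_set_name(cur, "Qcur");

    // metadata table plus content samples of cur and of its src0/src1
    ggml_tensor_printf(cur, (char *) __func__, __LINE__, true, true);

    // spot-check one element; the helper assumes f32 data resident on the CPU
    float v = ggml_get_tensor_index(cur, 0, 1, 0, 0);
    printf("Qcur[0][1] = %f\n", v);
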
From 607fbe99c7c3d93bed911ffe612d6c211f4bda80 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Mon, 22 Jan 2024 23:31:24 +0100
Subject: [PATCH 02/10] Update ggml.h

---
 ggml.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ggml.h b/ggml.h
index dca7bd9ce..6c501e170 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1899,6 +1899,10 @@ extern "C" {
     // dump the graph into a file using the dot format
     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
+    // visualize the tensor - extended adds more information - when printing sample content extended will also print src0 and src1 content
+    // example: ggml_tensor_printf(some_ggml_tensor,"function_name",0,true,true);
+    void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample);
+
     // build gradient checkpointing backward graph gb for gf using provided checkpoints
     // gb_tmp will contain original backward graph with rewritten backward process nodes,
     // but without the second forward pass nodes.
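The header comment's example passes a literal prefix and a zero line number. In practice the compiler can supply both; a convenience wrapper along these lines (hypothetical, not part of the patch) keeps call sites terse:

    // Hypothetical wrapper macro: __func__ and __LINE__ expand at the use site;
    // the cast matches the non-const char * parameter of the declaration above.
    #define GGML_TENSOR_DUMP(t) ggml_tensor_printf((t), (char *) __func__, __LINE__, true, true)
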
From 31bfd4a52bb369e3f47963d554742fa459f291be Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 00:26:48 +0100
Subject: [PATCH 03/10] Update ggml.c

Changed the sample precision to 7.4. Ideally this would be a parameter to
pass to the function: 7.3 often shows just -0.000 or 0.000 on weights.
A higher precision reveals more in these kinds of small-value tensors,
but it would also display less.
---
 ggml.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml.c b/ggml.c
index 267e99c58..376d8fe98 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20199,7 +20199,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
     if (n_dims == 1) {
         printf("| 1: ");
         for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
-            printf("%-7.3f, ", *(double *)((char *) tensor_data + i*tensor->nb[0]));
+            printf("%-7.4f, ", *(double *)((char *) tensor_data + i*tensor->nb[0]));
         }
         if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
         printf("\n%s", sep);
@@ -20208,7 +20208,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
         for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
             printf("| %d: ", i+1);
             for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
-                printf("%-7.3f ", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1]));
+                printf("%-7.4f ", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1]));
                 if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
             }
             printf("\n");
@@ -20221,7 +20221,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
             for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
                 printf("[");
                 for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
-                    printf("%-7.3f", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
+                    printf("%-7.4f", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
                     if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
                         printf(", ");
                 }
@@ -20240,7 +20240,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
                 for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
                     printf("[");
                     for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
-                        printf("%-7.3f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
+                        printf("%-7.4f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
                         if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
                             printf(", ");
                     }
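The trade-off described in the commit message is easy to reproduce in isolation. A standalone snippet (illustrative only, with made-up weight values) shows why three decimals flatten typical weight magnitudes to +/-0.000:

    #include <stdio.h>

    int main(void) {
        // typical magnitudes for trained weights
        double w[] = { 0.00042, -0.00007, 0.0312 };
        for (int i = 0; i < 3; i++) {
            // three decimals flatten the first two values to +/-0.000;
            // four decimals recover a digit; %g adapts to the magnitude
            // (the approach a later patch in this series switches to)
            printf("%-7.3f | %-7.4f | %11.4g\n", w[i], w[i], w[i]);
        }
        return 0;
    }
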
From 0fa71d17606ea1d10d83f58c95c5a86a59fcee5a Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 01:07:19 +0100
Subject: [PATCH 04/10] moved from ggml to ggml-backend, as backend retrieval
 is needed but the header is not available in ggml.c

---
 ggml-backend.c | 202 +++++++++++++++++++++++++++++++++++++++++++++
 ggml-backend.h |   4 +
 ggml.c         | 227 -------------------------------------------------
 ggml.h         |   4 -
 4 files changed, 206 insertions(+), 231 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 423512def..8f828564d 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1722,3 +1722,205 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
     return true;
 }
+
+void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor) {
+
+    char *tensor_data;
+    if (tensor->backend != GGML_BACKEND_CPU) {
+        // for any mmap solution we can actually set the CPU data of a tensor during load even if it's GPU offloaded
+        // this shouldn't have a negative effect, worked well in ggllm, saves the need of tensor_get operations for weights
+        if (tensor->buffer == NULL) {
+            printf("ggml_printTensorSample: tensor buffer is NULL\n");
+            return;
+        }
+        tensor_data = (char *) malloc(ggml_nbytes(tensor));
+        ggml_backend_tensor_get(tensor, tensor_data, 0, ggml_nbytes(tensor));
+    } else
+    {
+        tensor_data = tensor->data;
+        if (tensor_data == NULL) {
+            printf("ggml_printTensorSample: tensor data is NULL\n");
+            return;
+        }
+    }
+
+    const char *sep = "+-------------------------------------------------------------------------------------------+\n";
+    printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
+
+    const int MAX_ELEMENTS_ROW = 10;
+    const int MAX_ELEMENTS_COL = 6;
+    const int MAX_ELEMENTS_LAYER = 3; // layered
+    const int MAX_ELEMENTS_BATCH = 2; // repeated display
+    const char *dimensionLabels[] = {"Row", "Col", "Layer", "Batch"};
+
+    printf("\n%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
+    printf("| Total Elements : [ ");
+    for (int i = 0; i < ggml_n_dims(tensor); i++)
+        printf("%s:%-3" PRId64 " ", dimensionLabels[i], tensor->ne[i]);
+    printf("]\n%s", sep);
+
+    int n_dims = ggml_n_dims(tensor);
+
+    if (n_dims == 1) {
+        printf("| 1: ");
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("%-7.4f, ", *(double *)((char *) tensor_data + i*tensor->nb[0]));
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
+        printf("\n%s", sep);
+    }
+    else if (n_dims == 2) {
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("| %d: ", i+1);
+            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
+                printf("%-7.4f ", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1]));
+                if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
+            }
+            printf("\n");
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" .. additional rows\n");
+        printf("%s", sep);
+    } else if(n_dims == 3) {
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("| Row %d: ", i+1);
+            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
+                printf("[");
+                for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
+                    printf("%-7.4f", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
+                    if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
+                        printf(", ");
+                }
+                if(MAX_ELEMENTS_LAYER < tensor->ne[2]) printf(", ..");
+                printf("] ");
+            }
+            printf("\n");
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" ... additional layers\n");
+        printf("%s", sep);
+    } else if(n_dims == 4) {
+        for(int batch = 0; batch < tensor->ne[0] && batch < MAX_ELEMENTS_BATCH; batch++){
+            printf("Batch %d\n", batch+1);
+            for(int i = 0; i < tensor->ne[1] && i < MAX_ELEMENTS_ROW; i++){
+                printf("| Row %d: ", i+1);
+                for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
+                    printf("[");
+                    for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
+                        printf("%-7.4f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
+                        if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
+                            printf(", ");
+                    }
+                    if(MAX_ELEMENTS_LAYER < tensor->ne[3]) printf(", ..");
+                    printf("] ");
+                }
+                printf("\n");
+            }
+            if(MAX_ELEMENTS_BATCH < tensor->ne[0]) printf(" ... additional batches\n");
+            printf("%s", sep);
+        }
+    }
+    if (tensor->backend != GGML_BACKEND_CPU)
+        free(tensor_data);
+}
+
+void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample) {
+    char tmp_str[256] = {0};
+    int pos=0;
+    const char *sep = "+----------------------+----------------------+----------------------+----------------------+";
+    const char *sep_border = "+======================+======================+======================+======================+";
+    printf("%s\n", sep_border);
+    printf("| %s:%d\n", prefix,line);
+    printf("| %-32s [%s type]\n", tensor->name, ggml_type_name(tensor->type));
+    printf("%s\n", sep);
+    char strides[256] = {0};
+    /**
+    // nb[0] = sizeof(type)
+    // nb[1] = nb[0] * ne[0] + padding
+    // nb[i] = nb[i-1] * ne[i-1]
+    */
+    {
+        strides[0] = '\0';
+        for (int i = 0; i < ggml_n_dims(tensor); i++) {
+            char dim_str[20];
+            snprintf(dim_str, sizeof(dim_str), "%" PRId64, tensor->nb[i]);
+            strncat(strides, dim_str, sizeof(strides) - strlen(strides) - 1);
+            if (i != ggml_n_dims(tensor) - 1) {
+                strncat(strides, "x", sizeof(strides) - strlen(strides) - 1);
+            }
+        }
+    }
+
+    printf("| %-20s | %-20s | %-20s | %-20s |\n", "Dimensions", "Strides", "Layer id", "Backend");
+    int layer_id=-1; // tensor->meta structure not available
+    printf("| %-20d | %-20s | %-20d | %-20s |\n", ggml_n_dims(tensor), strides, layer_id, tensor->backend == GGML_BACKEND_CPU ? "CPU" : ((tensor->backend == GGML_BACKEND_GPU) ? "GPU" : "GPU_SPLIT"));
+    printf("%s\n", sep);
+    pos = 0;
+    for (int i = 0; i < ggml_n_dims(tensor); i++) {
+        pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->ne[i]);
+        if (i != ggml_n_dims(tensor) - 1) {
+            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
+        }
+    }
+    printf("| %-20s | %-20s | %-20s | %-20s |\n", "Elements", "Src0", "Src1","Operation");
+    printf("| %-20s |", tmp_str);
+
+    if (tensor->src[0]) {
+        pos = 0;
+        for (int i = 0; i < ggml_n_dims(tensor->src[0]); i++) {
+            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->src[0]->ne[i]);
+            if (i != ggml_n_dims(tensor->src[0]) - 1) {
+                pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
+            }
+        }
+        printf(" %-20s |", tmp_str);
+    } else {
+        printf(" %-20s |", "N/A");
+    }
+    if (tensor->src[1]) {
+        pos = 0;
+        for (int i = 0; i < ggml_n_dims(tensor->src[1]); i++) {
+            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->src[1]->ne[i]);
+            if (i != ggml_n_dims(tensor->src[1]) - 1) {
+                pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
+            }
+        }
+        printf(" %-20s |", tmp_str);
+    } else {
+        printf(" %-20s |", "N/A");
+    }
+    printf(" %-20s |", tensor->op > 0 ? ggml_op_name(tensor->op) : "N/A");
+    printf("\n");
+    printf("%s\n", sep);
+
+    if (extended) {
+        bool is_transposed = ggml_is_transposed(tensor);
+        bool is_permuted = ggml_is_permuted(tensor);
+        bool is_cont = ggml_is_contiguous(tensor);
+        printf("| %-17s%s | %-17s%s | %-17s%s | %-6s%11.2f MB |\n", "Transposed:", is_transposed ? "Yes" : "No ", "Permuted:", is_permuted ? "Yes" : "No ", "Contiguous:", is_cont ? "Yes" : "No ","Size:", ggml_nbytes(tensor)/(1024.0*1024.0));
+    }
+
+    if (extended) {
+        if (tensor->src[0] && strlen(tensor->src[0]->name)) {
+            printf("| %-20s | ", "Src0 name:");
+            printf("%-66s |\n", tensor->src[0]->name);
+        }
+        if (tensor->src[1] && strlen(tensor->src[1]->name)) {
+            printf("| %-20s | ", "Src1 name:");
+            printf("%-66s |\n", tensor->src[1]->name);
+        }
+        printf("%s\n\n", sep);
+    }
+
+    if (print_sample) {
+        if (extended) {
+            if (tensor->src[0] && tensor->src[0]->ne[0]) {
+                ggml_printTensorSample("src0", tensor->src[0]);
+            }
+            if (tensor->src[1] && tensor->src[1]->ne[0]) {
+                ggml_printTensorSample("src1", tensor->src[1]);
+            }
+        }
+        ggml_printTensorSample("dst", tensor);
+    }
+    printf("%s\n", sep_border);
+}
+
diff --git a/ggml-backend.h b/ggml-backend.h
index ab4ad773f..d20461d82 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -203,6 +203,10 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
 
+    // Tensor Debug
+    // visualize the tensor - extended adds more information - when printing sample content extended will also print src0 and src1 content
+    GGML_API void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample);
+
 
 #ifdef __cplusplus
 }
diff --git a/ggml.c b/ggml.c
index 376d8fe98..f85045c9c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20158,233 +20158,6 @@ void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor) {
-
-    char *tensor_data;
-    if (tensor->backend != GGML_BACKEND_CPU) {
-        // for any mmap solution we can actually set the CPU data of a tensor during load even if it's GPU offloaded
-        // this shouldn't have a negative effect, worked well in ggllm, saves the need of tensor_get operations for weights
-        if (tensor->buffer == NULL) {
-            printf("ggml_printTensorSample: tensor buffer is NULL\n");
-            return;
-        }
-        tensor_data = (char *) malloc(ggml_nbytes(tensor));
-        ggml_backend_tensor_get(tensor, tensor_data, 0, ggml_nbytes(tensor));
-    } else
-    {
-        tensor_data = tensor->data;
-        if (tensor_data == NULL) {
-            printf("ggml_printTensorSample: tensor data is NULL\n");
-            return;
-        }
-    }
-
-    const char *sep = "+-------------------------------------------------------------------------------------------+\n";
-    printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
-
-    const int MAX_ELEMENTS_ROW = 10;
-    const int MAX_ELEMENTS_COL = 6;
-    const int MAX_ELEMENTS_LAYER = 3; // layered
-    const int MAX_ELEMENTS_BATCH = 2; // repeated display
-    const char *dimensionLabels[] = {"Row", "Col", "Layer", "Batch"};
-
-    printf("\n%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
-    printf("| Total Elements : [ ");
-    for (int i = 0; i < ggml_n_dims(tensor); i++)
-        printf("%s:%-3" PRId64 " ", dimensionLabels[i], tensor->ne[i]);
-    printf("]\n%s", sep);
-
-    int n_dims = ggml_n_dims(tensor);
-
-    if (n_dims == 1) {
-        printf("| 1: ");
-        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
-            printf("%-7.4f, ", *(double *)((char *) tensor_data + i*tensor->nb[0]));
-        }
-        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
-        printf("\n%s", sep);
-    }
-    else if (n_dims == 2) {
-        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
-            printf("| %d: ", i+1);
-            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
-                printf("%-7.4f ", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1]));
-                if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
-            }
-            printf("\n");
-        }
-        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" .. additional rows\n");
-        printf("%s", sep);
-    } else if(n_dims == 3) {
-        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
-            printf("| Row %d: ", i+1);
-            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
-                printf("[");
-                for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
-                    printf("%-7.4f", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
-                    if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
-                        printf(", ");
-                }
-                if(MAX_ELEMENTS_LAYER < tensor->ne[2]) printf(", ..");
-                printf("] ");
-            }
-            printf("\n");
-        }
-        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" ... additional layers\n");
-        printf("%s", sep);
-    } else if(n_dims == 4) {
-        for(int batch = 0; batch < tensor->ne[0] && batch < MAX_ELEMENTS_BATCH; batch++){
-            printf("Batch %d\n", batch+1);
-            for(int i = 0; i < tensor->ne[1] && i < MAX_ELEMENTS_ROW; i++){
-                printf("| Row %d: ", i+1);
-                for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
-                    printf("[");
-                    for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
-                        printf("%-7.4f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
-                        if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
-                            printf(", ");
-                    }
-                    if(MAX_ELEMENTS_LAYER < tensor->ne[3]) printf(", ..");
-                    printf("] ");
-                }
-                printf("\n");
-            }
-            if(MAX_ELEMENTS_BATCH < tensor->ne[0]) printf(" ... additional batches\n");
-            printf("%s", sep);
-        }
-    }
-    if (tensor->backend != GGML_BACKEND_CPU)
-        free(tensor_data);
-}
-
-void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample) {
-    char tmp_str[256] = {0};
-    int pos=0;
-    const char *sep = "+----------------------+----------------------+----------------------+----------------------+";
-    const char *sep_border = "+======================+======================+======================+======================+";
-    printf("%s\n", sep_border);
-    printf("| %s:%d\n", prefix,line);
-    printf("| %-32s [%s type]\n", tensor->name, ggml_type_name(tensor->type));
-    printf("%s\n", sep);
-    char strides[256] = {0};
-    /**
-    // nb[0] = sizeof(type)
-    // nb[1] = nb[0] * ne[0] + padding
-    // nb[i] = nb[i-1] * ne[i-1]
-    */
-    {
-        strides[0] = '\0';
-        for (int i = 0; i < ggml_n_dims(tensor); i++) {
-            char dim_str[20];
-            snprintf(dim_str, sizeof(dim_str), "%" PRId64, tensor->nb[i]);
-            strncat(strides, dim_str, sizeof(strides) - strlen(strides) - 1);
-            if (i != ggml_n_dims(tensor) - 1) {
-                strncat(strides, "x", sizeof(strides) - strlen(strides) - 1);
-            }
-        }
-    }
-
-    printf("| %-20s | %-20s | %-20s | %-20s |\n", "Dimensions", "Strides", "Layer id", "Backend");
-    int layer_id=-1; // tensor->meta structure not available
-    printf("| %-20d | %-20s | %-20d | %-20s |\n", ggml_n_dims(tensor), strides, layer_id, tensor->backend == GGML_BACKEND_CPU ? "CPU" : ((tensor->backend == GGML_BACKEND_GPU) ? "GPU" : "GPU_SPLIT"));
-    printf("%s\n", sep);
-    pos = 0;
-    for (int i = 0; i < ggml_n_dims(tensor); i++) {
-        pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->ne[i]);
-        if (i != ggml_n_dims(tensor) - 1) {
-            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
-        }
-    }
-    printf("| %-20s | %-20s | %-20s | %-20s |\n", "Elements", "Src0", "Src1","Operation");
-    printf("| %-20s |", tmp_str);
-
-    if (tensor->src[0]) {
-        pos = 0;
-        for (int i = 0; i < ggml_n_dims(tensor->src[0]); i++) {
-            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->src[0]->ne[i]);
-            if (i != ggml_n_dims(tensor->src[0]) - 1) {
-                pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
-            }
-        }
-        printf(" %-20s |", tmp_str);
-    } else {
-        printf(" %-20s |", "N/A");
-    }
-    if (tensor->src[1]) {
-        pos = 0;
-        for (int i = 0; i < ggml_n_dims(tensor->src[1]); i++) {
-            pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, "%" PRId64, tensor->src[1]->ne[i]);
-            if (i != ggml_n_dims(tensor->src[1]) - 1) {
-                pos += snprintf(tmp_str + pos, sizeof(tmp_str) - pos, " x ");
-            }
-        }
-        printf(" %-20s |", tmp_str);
-    } else {
-        printf(" %-20s |", "N/A");
-    }
-    printf(" %-20s |", tensor->op > 0 ? GGML_OP_NAME[tensor->op] : "N/A");
-    printf("\n");
-    printf("%s\n", sep);
-
-    if (extended) {
-        bool is_transposed = ggml_is_transposed(tensor);
-        bool is_permuted = ggml_is_permuted(tensor);
-        bool is_cont = ggml_is_contiguous(tensor);
-        printf("| %-17s%s | %-17s%s | %-17s%s | %-6s%11.2f MB |\n", "Transposed:", is_transposed ? "Yes" : "No ", "Permuted:", is_permuted ? "Yes" : "No ", "Contiguous:", is_cont ? "Yes" : "No ","Size:", ggml_nbytes(tensor)/(1024.0*1024.0));
-    }
-
-    if (extended) {
-        if (tensor->src[0] && strlen(tensor->src[0]->name)) {
-            printf("| %-20s | ", "Src0 name:");
-            printf("%-66s |\n", tensor->src[0]->name);
-        }
-        if (tensor->src[1] && strlen(tensor->src[1]->name)) {
-            printf("| %-20s | ", "Src1 name:");
-            printf("%-66s |\n", tensor->src[1]->name);
-        }
-        printf("%s\n\n", sep);
-    }
-
-    if (print_sample) {
-        if (extended) {
-            if (tensor->src[0] && tensor->src[0]->ne[0]) {
-                ggml_printTensorSample("src0", tensor->src[0]);
-            }
-            if (tensor->src[1] && tensor->src[1]->ne[0]) {
-                ggml_printTensorSample("src1", tensor->src[1]);
-            }
-        }
-        ggml_printTensorSample("dst", tensor);
-    }
-    printf("%s\n", sep_border);
-}
-
-float ggml_get_tensor_index(const struct ggml_tensor* tensor, int ind1, int ind2, int ind3, int ind4) {
-    int n_dims = ggml_n_dims(tensor);
-    if (n_dims < 1 || n_dims > 4) {
-        printf("Error: Incorrect dimension number %d\n", n_dims);
-        return -1; // handle error
-    }
-
-    int indices[4] = {ind1, ind2, ind3, ind4};
-    int total_offset = 0;
-
-    for (int i = 0; i < n_dims; i++) {
-        if (indices[i] >= tensor->ne[i] || indices[i] < 0) {
-            printf("Error: Incorrect index for dimension %d\n", i);
-            printf("Index: %d, Dimension size: %" PRId64 "\n", indices[i], tensor->ne[i]);
-            return -1; // handle error
-        }
-
-        total_offset += indices[i] * tensor->nb[i];
-    }
-
-    // Return the value at the calculated offset
-    return *(float *)((char *) tensor->data + total_offset);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
     return 1;
diff --git a/ggml.h b/ggml.h
index 6c501e170..dca7bd9ce 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1899,10 +1899,6 @@ extern "C" {
     // dump the graph into a file using the dot format
     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
-    // visualize the tensor - extended adds more information - when printing sample content extended will also print src0 and src1 content
-    // example: ggml_tensor_printf(some_ggml_tensor,"function_name",0,true,true);
-    void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample);
-
     // build gradient checkpointing backward graph gb for gf using provided checkpoints
     // gb_tmp will contain original backward graph with rewritten backward process nodes,
     // but without the second forward pass nodes.
From 8ccb0d69cd0f627a94b31ab1412b3f8fb04fe5a4 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 01:09:19 +0100
Subject: [PATCH 05/10] trailing ws

---
 ggml-backend.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 8f828564d..86218e740 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1743,10 +1743,10 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
             return;
         }
     }
-    
+
     const char *sep = "+-------------------------------------------------------------------------------------------+\n";
     printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
-    
+
     const int MAX_ELEMENTS_ROW = 10;
     const int MAX_ELEMENTS_COL = 6;
     const int MAX_ELEMENTS_LAYER = 3; // layered
@@ -1787,7 +1787,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
                 printf("[");
                 for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
                     printf("%-7.4f", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
-                    if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1) 
+                    if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
                         printf(", ");
                 }
                 if(MAX_ELEMENTS_LAYER < tensor->ne[2]) printf(", ..");
@@ -1806,7 +1806,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
                     printf("[");
                     for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
                         printf("%-7.4f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
-                        if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1) 
+                        if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
                             printf(", ");
                     }
                     if(MAX_ELEMENTS_LAYER < tensor->ne[3]) printf(", ..");
@@ -1862,7 +1862,7 @@ void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line
     }
     printf("| %-20s | %-20s | %-20s | %-20s |\n", "Elements", "Src0", "Src1","Operation");
     printf("| %-20s |", tmp_str);
-    
+
     if (tensor->src[0]) {
         pos = 0;
         for (int i = 0; i < ggml_n_dims(tensor->src[0]); i++) {

From 8690363183329f97f509adc92d84916d02a663d7 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 01:13:39 +0100
Subject: [PATCH 06/10] added inttypes.h

---
 ggml-backend.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml-backend.c b/ggml-backend.c
index 86218e740..8180a28c7 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -8,6 +8,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <inttypes.h>
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
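The include matters because the printers rely on the PRId64 format macros, which live in <inttypes.h>; without it the build breaks on toolchains that do not pull the header in transitively. A minimal illustration:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        int64_t ne = 4096;                 // ggml stores ne[] as int64_t
        printf("dim: %" PRId64 "\n", ne);  // portable on 32- and 64-bit targets
        return 0;
    }
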
From 5122c828695708b701f50d5b1e1781b0d14ec727 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 01:28:02 +0100
Subject: [PATCH 07/10] bugfix

---
 ggml-backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 8180a28c7..3ab7970f4 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1746,7 +1746,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
     }
 
     const char *sep = "+-------------------------------------------------------------------------------------------+\n";
-    printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
+    //printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
 
     const int MAX_ELEMENTS_ROW = 10;
     const int MAX_ELEMENTS_COL = 6;

From 582bddc37a526b261bcc6128b4b8221391fa1c73 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 01:45:27 +0100
Subject: [PATCH 08/10] changed formatting to adaptive exponential format

---
 ggml-backend.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 3ab7970f4..81d16d96b 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1746,8 +1746,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
     }
 
     const char *sep = "+-------------------------------------------------------------------------------------------+\n";
-    //printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, ggml_n_dims(tensor));
-    
+
     const int MAX_ELEMENTS_ROW = 10;
     const int MAX_ELEMENTS_COL = 6;
     const int MAX_ELEMENTS_LAYER = 3; // layered
@@ -1765,7 +1764,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
     if (n_dims == 1) {
         printf("| 1: ");
         for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
-            printf("%-7.4f, ", *(double *)((char *) tensor_data + i*tensor->nb[0]));
+            printf("%11.3g, ", *(double *)((char *) tensor_data + i*tensor->nb[0]));
         }
         if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
         printf("\n%s", sep);
@@ -1774,7 +1773,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
         for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
             printf("| %d: ", i+1);
             for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
-                printf("%-7.4f ", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1]));
+                printf("%11.4g ", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1]));
                 if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
             }
             printf("\n");
@@ -1787,7 +1786,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
             for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
                 printf("[");
                 for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
-                    printf("%-7.4f", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
+                    printf("%11.4g", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
                     if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
                         printf(", ");
                 }
@@ -1806,7 +1805,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
                 for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
                     printf("[");
                     for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
-                        printf("%-7.4f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
+                        printf("%11.4f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
                         if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
                             printf(", ");
                     }
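The %g conversion switches between fixed and scientific notation depending on magnitude, so tiny weights and large activations both print meaningfully in the same column width. A quick standalone check (illustrative; note the 4-dim branch above keeps %11.4f, which the final patch in this series replaces wholesale):

    #include <stdio.h>

    int main(void) {
        printf("%11.4g\n", 0.00001234); // prints "  1.234e-05"
        printf("%11.4g\n", 123.456);    // prints "      123.5"
        printf("%11.4g\n", 0.25);       // prints "       0.25"
        return 0;
    }
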
From 0177431cb2229c97ba7a4af07aa4039969171670 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Tue, 23 Jan 2024 01:46:49 +0100
Subject: [PATCH 09/10] ws

---
 ggml-backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 81d16d96b..b91a8e15b 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1746,7 +1746,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
     }
 
     const char *sep = "+-------------------------------------------------------------------------------------------+\n";
-    
+
     const int MAX_ELEMENTS_ROW = 10;
     const int MAX_ELEMENTS_COL = 6;
     const int MAX_ELEMENTS_LAYER = 3; // layered

From b639e2a73fbf1b67adfe7d87ca3c0250d0f3266b Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Thu, 8 Feb 2024 03:00:00 +0100
Subject: [PATCH 10/10] Bugfix printf tensor

Added a function to access the tensor data by index (like an array) and
print the value as float/int.
Updated the tensor print loops to use the new function.
This fixes two bugs: 1) the previous printf did not work after a quick fix,
and 2) it did not show rows/columns in the right place.
---
 ggml-backend.c | 85 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 72 insertions(+), 13 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index f820c3eac..10cc6de0b 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1836,22 +1836,80 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
     return true;
 }
 
-void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor) {
-
-    char *tensor_data;
-    if (tensor->backend != GGML_BACKEND_CPU) {
+// printf one number from a tensor (cont only) like an array index[0][1][2][3]
+void ggml_print_f32_index(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
+    void * data;
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = {i0, i1, i2, i3};
+        int index_cont = i0*tensor->ne[3]*tensor->ne[2]*tensor->ne[1]*tensor->nb[0] + i1*tensor->ne[3]*tensor->ne[2]*tensor->nb[1] + i2*tensor->ne[3]*tensor->nb[2] + i3*tensor->nb[3];
+        ggml_unravel_index(tensor, index_cont, &id[0], &id[1], &id[2], &id[3]); // untested
+        printf("NONCONT");
+        return;
+        i0 = id[0];
+        i1 = id[1];
+        i2 = id[2];
+        i3 = id[3];
+        data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; // untested
+    } else
+    {
+        data = (char *) tensor->data + i0*tensor->ne[3]*tensor->ne[2]*tensor->ne[1]*tensor->nb[0] + i1*tensor->ne[3]*tensor->ne[2]*tensor->nb[1] + i2*tensor->ne[3]*tensor->nb[2] + i3*tensor->nb[3];
+        // void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    }
+
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            printf("%8d", *((int8_t *) data));
+            break;
+        case GGML_TYPE_I16:
+            printf("%8d", *((int16_t *) data));
+            break;
+        case GGML_TYPE_I32:
+            printf("%8d", *((int32_t *) data));
+            break;
+        case GGML_TYPE_F16:
+            printf("%8.4f", GGML_FP16_TO_FP32(*((ggml_fp16_t *) data)));
+            break;
+        case GGML_TYPE_F32:
+            printf("%8.4f", *((float *) data));
+            break;
+        default:
+            printf("UNKTYPE");
+            break;
+    }
+}
+void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor_in) {
+    struct ggml_tensor tensor_dummy; // avoid any ctx
+    struct ggml_tensor * tensor = &tensor_dummy;
+
+    if (tensor_in->backend != GGML_BACKEND_CPU) {
         // for any mmap solution we can actually set the CPU data of a tensor during load even if it's GPU offloaded
         // this shouldn't have a negative effect, worked well in ggllm, saves the need of tensor_get operations for weights
-        if (tensor->buffer == NULL) {
+        if (tensor_in->buffer == NULL) {
             printf("ggml_printTensorSample: tensor buffer is NULL\n");
             return;
         }
-        tensor_data = (char *) malloc(ggml_nbytes(tensor));
-        ggml_backend_tensor_get(tensor, tensor_data, 0, ggml_nbytes(tensor));
+        tensor->data = (char *) malloc(ggml_nbytes(tensor_in));
+        ggml_backend_tensor_get(tensor_in, tensor->data, 0, ggml_nbytes(tensor_in));
+        memcpy(tensor->name, tensor_in->name, sizeof(tensor->name));
+        tensor->type = tensor_in->type;
+        tensor->ne[0] = tensor_in->ne[0];
+        tensor->ne[1] = tensor_in->ne[1];
+        tensor->ne[2] = tensor_in->ne[2];
+        tensor->ne[3] = tensor_in->ne[3];
+        tensor->nb[0] = tensor_in->nb[0];
+        tensor->nb[1] = tensor_in->nb[1];
+        tensor->nb[2] = tensor_in->nb[2];
+        tensor->nb[3] = tensor_in->nb[3];
+        tensor->backend = GGML_BACKEND_CPU;
+        tensor->op = tensor_in->op;
+        tensor->view_offs = tensor_in->view_offs;
+        tensor->view_src = tensor_in->view_src;
     } else
     {
-        tensor_data = tensor->data;
-        if (tensor_data == NULL) {
+        tensor = tensor_in;
+        if (tensor->data == NULL) {
             printf("ggml_printTensorSample: tensor data is NULL\n");
             return;
         }
@@ -1876,7 +1934,8 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
     if (n_dims == 1) {
         printf("| 1: ");
         for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
-            printf("%11.3g, ", *(double *)((char *) tensor_data + i*tensor->nb[0]));
+            ggml_print_f32_index(tensor, i, 0, 0, 0);
+            if(i == MAX_ELEMENTS_ROW - 1 && tensor->ne[0] > MAX_ELEMENTS_ROW) printf(", ..");
         }
         if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
         printf("\n%s", sep);
@@ -1885,7 +1944,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
         for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
             printf("| %d: ", i+1);
             for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
-                printf("%11.4g ", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1]));
+                ggml_print_f32_index(tensor, i, j, 0, 0);
                 if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
             }
             printf("\n");
@@ -1898,7 +1957,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
             for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
                 printf("[");
                 for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
-                    printf("%11.4g", *(double *)((char *) tensor_data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
+                    ggml_print_f32_index(tensor, i, j, k, 0);
                     if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
                         printf(", ");
                 }
@@ -1917,7 +1976,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
                 for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
                     printf("[");
                     for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
-                        printf("%11.4f", *(double *)((char *) tensor_data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
+                        ggml_print_f32_index(tensor, batch, i, j, k);
                         if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
                             printf(", ");
                     }
@@ -1931,7 +1990,7 @@ void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tenso
         }
     }
     if (tensor->backend != GGML_BACKEND_CPU)
-        free(tensor_data);
+        free(tensor->data);
 }
 
 void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample) {
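Given the stride convention quoted earlier in this series (nb[0] = sizeof(type), nb[i] = nb[i-1] * ne[i-1]), the conventional element offset is the plain stride sum, which is what the commented-out line in ggml_print_f32_index computes. A minimal sketch of that addressing, assuming an f32 tensor and that ggml.h is on the include path (the helper name is illustrative, not part of the patch):

    #include "ggml.h"

    // Conventional ggml element addressing:
    // offset = i0*nb[0] + i1*nb[1] + i2*nb[2] + i3*nb[3]
    static float tensor_get_f32(const struct ggml_tensor * t, int i0, int i1, int i2, int i3) {
        const char * base = (const char *) t->data;
        const size_t off = (size_t) i0*t->nb[0] + (size_t) i1*t->nb[1]
                         + (size_t) i2*t->nb[2] + (size_t) i3*t->nb[3];
        return *(const float *)(base + off);
    }
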