From b484674707d0d106647afd5f0322c252f3b72690 Mon Sep 17 00:00:00 2001
From: mike dupont <mike.dupont@introspector.local>
Date: Sun, 26 Nov 2023 19:31:56 -0500
Subject: [PATCH] wip

---
 ggml.cpp | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 75 insertions(+), 2 deletions(-)
diff --git a/ggml.cpp b/ggml.cpp
index 82eda26d5..7aa8f66d5 100644
--- a/ggml.cpp
+++ b/ggml.cpp
@@ -9417,7 +9417,69 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
+#include <iostream>
+#include <algorithm>
+#include <unordered_map> // hash table
+#include <numeric>       // for copying and iterating over arrays
+
 void ggml_tensor_checksum(const char * name,const struct ggml_tensor * tensor);
+void ggml_tensor_hash(const char * name,const struct ggml_tensor * tensor, int decimalPlace);
+#include "ggml-backend-impl.h"
+// helper function to convert the tensor buffer to a float array
+float* ggml_tensor_to_float(const ggml_tensor& tensor, size_t* out_size) {
+  //if (tensor->type != GGML_TYPE_FLOAT) {
+  //throw std::runtime_error("Only support for floating-point tensors");
+  //}
+    const size_t num_elements = tensor->n_dims > 0 ? std::accumulate(tensor->nb, tensor->nb + tensor->n_dims, 1) : 0;
+    float* buffer = new float[num_elements];
+    if (out_size) {
+        *out_size = num_elements;
+    }
+    memcpy(buffer, ggml_get_data_f32(tensor), ggml_nbytes(tensor));
+    //memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings));
+    return buffer;
+}
+
+// function to create a hash table of the N most common values of a given tensor
+std::vector<double> find_n_most_common_values(const ggml_tensor& tensor, int decimal_place, size_t top_n) {
+    float* buffer = ggml_tensor_to_float(tensor, nullptr);
+    auto values = std::unordered_map<double, int>(); // hash table to store the count of each value
+
+    if (decimal_place <= 0 || top_n <= 0) {
+        throw std::runtime_error("Invalid parameters: decimal_place and top_n must be positive integers");
+    }
+
+    // find N most common values by counting the frequency of each value with truncated decimal places
+    for (size_t i = 0; i < buffer->size(); ++i) {
+        const double value = std::pow(10, static_cast<double>(decimal_place));
+        buffer[i] *= value; // multiply by value to truncate decimal places
+        int count = values.find(buffer[i])->second + 1;
+        if (count > top_n) {
+            continue;
+        }
+        if (decimal_place <= 0 || count >= top_n) {
+            break;
+        }
+    }
+
+    // sort the values in descending order of frequency
+    auto it = values.begin();
+    std::vector<double> n_most_common(top_n);
+    size_t j = 0;
+    while (it != values.end() && j < top_n) {
+        const int count = it->second;
+        if (count <= top_n - j) {
+            break;
+        }
+        n_most_common[j++] = it->first;
+        it++;
+    }
+
+    delete[] buffer;
+    return n_most_common;
+}
+
+
 void ggml_tensor_checksum(const char * name,const struct ggml_tensor * tensor) {
   const int64_t ne = ggml_nelements(tensor) ;
   float fmin=0;
@@ -9425,6 +9487,16 @@ void ggml_tensor_checksum(const char * name,const struct ggml_tensor * tensor) {
   float fmax=0;
   float fsum=0;
 
+  const int top_n=10;
+  const int decimal_place = 5;
+  
+  auto n_most_common_values = find_n_most_common_values(tensor, decimal_place, top_n);
+  std::cout << "N most common values with decimal places " << decimal_place << ": ";
+  for (const auto& value : n_most_common_values) {
+    std::cout << value << " ";
+  }
+  std::cout << std::endl;
+
   for (int64_t j = 0; j < ne; ++j) {
     float f = ggml_get_f32_1d(tensor, j);
     if (j ==0) {
@@ -9442,13 +9514,14 @@ void ggml_tensor_checksum(const char * name,const struct ggml_tensor * tensor) {
   }
 
   auto type_name = ggml_type_name(tensor->type);
-// color_name
-  fprintf(stderr, "JSON: { \"name1\" :\"%s\", \"cnt\":\"%ld\", \"first\":\"%f\",\"max\":\"%f\",\"min\":\"%f\",\"sum\":\"%f\", \"name\":\"%s\", \"type\":\"%s\"}\n",
+  float fmean = fsum / ne;
+  fprintf(stderr, "JSON: {\"name1\":\"%s\",\"cnt\":\"%ld\",\"first\":\"%f\",\"max\":\"%f\",\"min\":\"%f\",\"mean\":\"%f\",\"sum\":\"%f\",\"name\":\"%s\",\"type\":\"%s\"}\n",
 	  name,
 	  ne,
 	  ffirst,
 	  fmax,
 	  fmin,
+	  fmean,
 	  fsum,
 	  tensor->name,
 	  std::string(type_name).c_str()