From b484674707d0d106647afd5f0322c252f3b72690 Mon Sep 17 00:00:00 2001
From: mike dupont
Date: Sun, 26 Nov 2023 19:31:56 -0500
Subject: [PATCH] wip

---
Review notes (this section is ignored by git am):
- Hunk offsets and the diffstat below were recomputed for the corrected code.
- The blob ids on the "index" line are carried over from the original WIP patch.
- Indentation of context lines was reconstructed; apply with
  `git apply --ignore-whitespace` if the context does not match.

 ggml.cpp | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 98 insertions(+), 2 deletions(-)

diff --git a/ggml.cpp b/ggml.cpp
index 82eda26d5..7aa8f66d5 100644
--- a/ggml.cpp
+++ b/ggml.cpp
@@ -9417,7 +9417,92 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
+#include <algorithm>     // std::min, std::partial_sort
+#include <cmath>         // std::isfinite, std::pow, std::trunc
+#include <iostream>      // std::cout
+#include <stdexcept>     // std::runtime_error
+#include <unordered_map> // hash table
+#include <utility>       // std::pair
+#include <vector>
+
 void ggml_tensor_checksum(const char * name,const struct ggml_tensor * tensor);
+void ggml_tensor_hash(const char * name,const struct ggml_tensor * tensor, int decimalPlace);
+#include "ggml-backend-impl.h"
+
+// Copies every element of `tensor` into a newly allocated float array.
+//
+// Elements are read one at a time through ggml_get_f32_1d(), the same accessor
+// ggml_tensor_checksum() uses, and the element count comes from ggml_nelements().
+//
+// tensor   - tensor to read.
+// out_size - optional; when non-null it receives the number of floats written.
+//
+// Returns a buffer allocated with new[]; the caller owns it and must release it
+// with delete[].
+float* ggml_tensor_to_float(const ggml_tensor& tensor, size_t* out_size) {
+    const int64_t num_elements = ggml_nelements(&tensor);
+    if (out_size) {
+        *out_size = static_cast<size_t>(num_elements);
+    }
+
+    float* buffer = new float[num_elements];
+    for (int64_t i = 0; i < num_elements; ++i) {
+        buffer[i] = ggml_get_f32_1d(&tensor, i);
+    }
+    return buffer;
+}
+
+// Returns up to `top_n` of the most frequent values in `tensor`, most frequent first.
+//
+// Each element is truncated to `decimal_place` decimal places before counting, so
+// values that differ only beyond that precision fall into the same bucket.
+// Non-finite elements (NaN, +/-inf) are skipped. Ties in frequency are ordered by
+// ascending value so the output is deterministic.
+//
+// tensor        - tensor to scan; elements are read with ggml_get_f32_1d().
+// decimal_place - number of decimal places kept; must be > 0.
+// top_n         - maximum number of values returned; must be > 0.
+//
+// Returns the truncated values in the tensor's original scale. The result holds
+// fewer than `top_n` entries when the tensor has fewer distinct buckets.
+//
+// Throws std::runtime_error when decimal_place or top_n is not positive.
+std::vector<float> find_n_most_common_values(const ggml_tensor& tensor, int decimal_place, size_t top_n) {
+    // Validate before doing any work so nothing is allocated on the error path.
+    if (decimal_place <= 0 || top_n == 0) {
+        throw std::runtime_error("Invalid parameters: decimal_place and top_n must be positive integers");
+    }
+
+    const double scale = std::pow(10.0, static_cast<double>(decimal_place));
+    const int64_t num_elements = ggml_nelements(&tensor);
+
+    // Bucket key is the element scaled by 10^decimal_place and truncated to a whole number.
+    std::unordered_map<double, size_t> counts;
+    for (int64_t i = 0; i < num_elements; ++i) {
+        const double scaled = static_cast<double>(ggml_get_f32_1d(&tensor, i)) * scale;
+        if (!std::isfinite(scaled)) {
+            continue; // skip NaN/inf: a NaN key would never match an existing bucket
+        }
+        ++counts[std::trunc(scaled)];
+    }
+
+    // Only the leading `keep` entries need to be ordered.
+    std::vector<std::pair<double, size_t>> ranked(counts.begin(), counts.end());
+    const size_t keep = std::min(top_n, ranked.size());
+    std::partial_sort(ranked.begin(), ranked.begin() + keep, ranked.end(),
+        [](const std::pair<double, size_t>& a, const std::pair<double, size_t>& b) {
+            return a.second != b.second ? a.second > b.second : a.first < b.first;
+        });
+
+    std::vector<float> n_most_common;
+    n_most_common.reserve(keep);
+    for (size_t i = 0; i < keep; ++i) {
+        n_most_common.push_back(static_cast<float>(ranked[i].first / scale));
+    }
+    return n_most_common;
+}
+
+
 void ggml_tensor_checksum(const char * name,const struct ggml_tensor * tensor) {
     const int64_t ne = ggml_nelements(tensor) ;
     float fmin=0;
@@ -9425,6 +9510,16 @@ void ggml_tensor_checksum(const char * name,const struct ggml_tensor * tensor) {
     float fmax=0;
     float fsum=0;
 
+    const int top_n=10;
+    const int decimal_place = 5;
+
+    auto n_most_common_values = find_n_most_common_values(*tensor, decimal_place, top_n);
+    std::cout << "N most common values with decimal places " << decimal_place << ": ";
+    for (const auto& value : n_most_common_values) {
+        std::cout << value << " ";
+    }
+    std::cout << std::endl;
+
     for (int64_t j = 0; j < ne; ++j) {
         float f = ggml_get_f32_1d(tensor, j);
         if (j ==0) {
@@ -9442,13 +9537,14 @@ void ggml_tensor_checksum(const char * name,const struct ggml_tensor * tensor) {
     }
 
     auto type_name = ggml_type_name(tensor->type);
-// color_name
-    fprintf(stderr, "JSON: { \"name1\" :\"%s\", \"cnt\":\"%ld\", \"first\":\"%f\",\"max\":\"%f\",\"min\":\"%f\",\"sum\":\"%f\", \"name\":\"%s\", \"type\":\"%s\"}\n",
+    const float fmean = ne > 0 ? fsum / ne : 0.0f; // avoid dividing by zero on an empty tensor
+    fprintf(stderr, "JSON: {\"name1\":\"%s\",\"cnt\":\"%ld\",\"first\":\"%f\",\"max\":\"%f\",\"min\":\"%f\",\"mean\":\"%f\",\"sum\":\"%f\",\"name\":\"%s\",\"type\":\"%s\"}\n",
             name,
             ne,
             ffirst,
             fmax,
             fmin,
+            fmean,
             fsum,
             tensor->name,
             std::string(type_name).c_str()