From 8fed75edca306996364aa85133a284df152d13bd Mon Sep 17 00:00:00 2001
From: brian khuu <mofosyne@gmail.com>
Date: Sat, 6 Jul 2024 21:48:24 +1000
Subject: [PATCH] llama-gguf-hash: change argument from xxhash --> xxh64 and
 update readme

---
 examples/gguf-hash/README.md            | 56 +++++++++++++++++++++----
 examples/gguf-hash/deps/xxhash/xxhash.h |  2 +-
 examples/gguf-hash/gguf-hash.cpp        | 38 ++++++++---------
 3 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md
index 1b23e8dd3..ce0d80943 100644
--- a/examples/gguf-hash/README.md
+++ b/examples/gguf-hash/README.md
@@ -1,31 +1,71 @@
-## GGUF hash Example
 
-CLI to hash GGUF files.
+# llama-gguf-hash
+
+CLI to hash GGUF files to detect differences on a per-model and per-tensor level.
 
 **Command line options:**
 
-- `--xxhash`: use xhash (default)
+- `--xxh64`: use xxhash 64bit hash mode (default)
 - `--sha1`: use sha1
 - `--uuid`: use uuid
 - `--sha256`: use sha256
 
-### Compile Example
+## About
 
-```
+While most POSIX systems already have hash checking programs like sha256sum, they
+are designed to check entire files. This is not ideal for our purpose if we want
+to check for consistency of the tensor data even if the metadata content of the
+gguf KV store has been updated.
+
+This program is designed to hash a gguf tensor payload on a 'per tensor layer'
+basis in addition to an 'entire tensor model' hash. The intent is that the entire
+tensor model can be checked first, but if any inconsistencies are detected,
+then the per tensor hash can be used to narrow down the specific tensor layer
+that has inconsistencies.
+
+For Maintainers:
+- Detection of tensor inconsistency during development and automated tests
+    - This is served by xxh64 which is fast
+    - This is also served by having per tensor layer to assist in narrowing down
+      the location of the faulty tensor layer
+    - This is also served by sha1 which is much slower but more widely supported
+
+For Model Creators:
+- Optional consistent UUID generation based on model tensor content
+    - This is served by UUIDv5 which is useful for database keys
+        - llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
+            - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
+
+For Model Users:
+- Assurance of tensor layer integrity even if metadata was updated
+    - This is served by sha256 which is still considered very secure as of 2024
+
+### Design Note
+
+- The default behavior of this program if no arguments are provided is to hash
+  using xxhash's xxh64 mode because it is very fast and is primarily targeted
+  towards maintainers who may want to use this in automated tests.
+- xxhash supports xxh32 and xxh128 for 32bit hash and 128bit hash respectively,
+  however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus
+  would have a better affinity to calculating hash that is 64bit in size.
+
+## Compile Example
+
+```bash
 cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
 make -C build clean
 make -C build llama-gguf-hash VERBOSE=1
 ./build/bin/llama-gguf-hash test.gguf
-./build/bin/llama-gguf-hash --xxhash test.gguf
+./build/bin/llama-gguf-hash --xxh64 test.gguf
 ./build/bin/llama-gguf-hash --sha1 test.gguf
 ./build/bin/llama-gguf-hash --uuid test.gguf
 ./build/bin/llama-gguf-hash --sha256 test.gguf
 ```
 
-### Crypto/Hash Libraries Used
+## Crypto/Hash Libraries Used
 
 These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs)
 
-- https://github.com/mofosyne/xxHash
+- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
 - https://github.com/clibs/sha1/
 - https://github.com/jb55/sha256.c
diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h
index 599fea292..c0fafe20d 100644
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@@ -1687,7 +1687,7 @@ struct XXH64_state_s {
 
 #ifndef XXH_NO_XXH3
 
-/* Windows SDK under 10.0.22000 check is missing stdalign.h so we add a check
+/* Windows SDK under 10.0.22000 is missing stdalign.h so we add a check
    before allowing the windows compiler to use the C11 form.
    Reference: https://github.com/Cyan4973/xxHash/issues/955 */
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) \
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
index c5fa7af57..cfb0cc258 100644
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -30,7 +30,7 @@ extern "C" {
 
 struct hash_params {
     std::string input;
-    bool xxhash = false;
+    bool xxh64 = false;
     bool sha1 = false;
     bool uuid = false;
     bool sha256 = false;
@@ -45,7 +45,7 @@ static void hash_print_usage(const char * executable) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help              show this help message and exit\n");
-    printf("      --xxhash            use xxhash\n");
+    printf("      --xxh64             use xxh64\n");
     printf("      --sha1              use sha1\n");
     printf("      --uuid              use uuid\n");
     printf("      --sha256            use sha256\n");
@@ -69,9 +69,9 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             exit(0);
         }
 
-        if (arg == "--xxhash") {
+        if (arg == "--xxh64") {
             arg_found = true;
-            params.xxhash = true;
+            params.xxh64 = true;
         }
 
         if (arg == "--sha1") {
@@ -94,13 +94,13 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
         }
     }
 
-    if (!params.xxhash
+    if (!params.xxh64
             && !params.sha1
             && !params.uuid
             && !params.sha256
         ) {
-        // By default if no swich argument provided, assume xxhash
-        params.xxhash = true;
+        // By default if no switch argument provided, assume xxh64
+        params.xxh64 = true;
     }
 
     if (argc - arg_idx < 1) {
@@ -132,16 +132,16 @@ static bool gguf_hash(const hash_params & hash_params) {
         /*.ctx      = */ &ctx_data,
     };
 
-    // xxhash init
-    XXH64_state_t* xxhash_model_hash_state = NULL;
-    if (hash_params.xxhash) {
-        xxhash_model_hash_state = XXH64_createState();
-        if (xxhash_model_hash_state==NULL) {
+    // xxh64 init
+    XXH64_state_t* xxh64_model_hash_state = NULL;
+    if (hash_params.xxh64) {
+        xxh64_model_hash_state = XXH64_createState();
+        if (xxh64_model_hash_state==NULL) {
             abort();
         }
 
         XXH64_hash_t const seed = 0;
-        if (XXH64_reset(xxhash_model_hash_state, seed) == XXH_ERROR) {
+        if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
             abort();
         }
     }
@@ -166,7 +166,7 @@ static bool gguf_hash(const hash_params & hash_params) {
         auto n_bytes = ggml_nbytes(cur);
         auto *raw_data = cur->data;
 
-        if (hash_params.xxhash) {
+        if (hash_params.xxh64) {
 
             // Per Layer Hash
             XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
@@ -177,10 +177,10 @@ static bool gguf_hash(const hash_params & hash_params) {
                 sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
             }
 
-            printf("xxhash  %s  %s:%s\n", hex_result, fname.c_str(), name);
+            printf("xxh64   %s  %s:%s\n", hex_result, fname.c_str(), name);
 
             // Overall Model Hash
-            if (XXH64_update(xxhash_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
+            if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
         }
 
         if (hash_params.sha1) {
@@ -218,8 +218,8 @@ static bool gguf_hash(const hash_params & hash_params) {
         }
     }
 
-    if (hash_params.xxhash) {
-        XXH64_hash_t const hash = XXH64_digest(xxhash_model_hash_state);
+    if (hash_params.xxh64) {
+        XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
 
         char hex_result[17];
         for (int  offset = 0; offset < 8; offset++) {
@@ -227,7 +227,7 @@ static bool gguf_hash(const hash_params & hash_params) {
             sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
         }
 
-        printf("xxhash  %s  %s\n", hex_result, fname.c_str());
+        printf("xxh64   %s  %s\n", hex_result, fname.c_str());
     }
 
     if (hash_params.sha1) {