llama-gguf-hash: change argument from xxhash --> xxh64 and update readme

2024-07-06 21:48:24 +10:00 · 2024-07-06 21:48:24 +10:00 · 8fed75edca
commit 8fed75edca
parent 8f3749642e
3 changed files with 68 additions and 28 deletions
--- a/examples/gguf-hash/README.md
+++ b/examples/gguf-hash/README.md
@ -1,31 +1,71 @@
-## GGUF hash Example

-CLI to hash GGUF files.
+# llama-gguf-hash
+
+CLI to hash GGUF files to detect differences on a per-model and per-tensor level.

 **Command line options:**

- `--xxhash`: use xhash (default)
+- `--xxh64`: use xxhash 64bit hash mode (default)
 - `--sha1`: use sha1
 - `--uuid`: use uuid
 - `--sha256`: use sha256

-### Compile Example
+## About

-```
+While most POSIX systems already have hash checking programs like sha256sum, they
+are designed to check entire files. This is not ideal for our purpose if we want
+to check for consistency of the tensor data even if the metadata content of the
+gguf KV store has been updated.
+
+This program is designed to hash a gguf tensor payload on a 'per tensor layer'
+basis in addition to an 'entire tensor model' hash. The intent is that the entire
+tensor model hash can be checked first, but if any inconsistencies are detected,
+then the per tensor hash can be used to narrow down the specific tensor layer
+that has inconsistencies.
+
+For Maintainers:
+- Detection of tensor inconsistency during development and automated tests
+    - This is served by xxh64 which is fast
+    - This is also served by having per tensor layer to assist in narrowing down
+      the location of the faulty tensor layer
+    - This is also served by sha1 which is much slower but more widely supported
+
+For Model Creators:
+- Optional consistent UUID generation based on model tensor content
+    - This is served by UUIDv5 which is useful for database keys
+        - llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
+            - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
+
+For Model Users:
+- Assurance of tensor layer integrity even if metadata was updated
+    - This is served by sha256 which is still considered very secure as of 2024
+
+### Design Note
+
+- The default behavior of this program if no arguments are provided is to hash
+  using xxhash's xxh64 mode because it is very fast and is primarily targeted
+  towards maintainers who may want to use this in automated tests.
+- xxhash supports xxh32 and xxh128 for 32bit hash and 128bit hash respectively,
+  however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus
+  would have a better affinity to calculating hash that is 64bit in size.
+
+## Compile Example
+
+```bash
 cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
 make -C build clean
 make -C build llama-gguf-hash VERBOSE=1
 ./build/bin/llama-gguf-hash test.gguf
-./build/bin/llama-gguf-hash --xxhash test.gguf
+./build/bin/llama-gguf-hash --xxh64 test.gguf
 ./build/bin/llama-gguf-hash --sha1 test.gguf
 ./build/bin/llama-gguf-hash --uuid test.gguf
 ./build/bin/llama-gguf-hash --sha256 test.gguf
 ```

-### Crypto/Hash Libraries Used
+## Crypto/Hash Libraries Used

 These micro C library dependencies were installed via the [clib c package manager](https://github.com/clibs)

- https://github.com/mofosyne/xxHash
+- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
 - https://github.com/clibs/sha1/
 - https://github.com/jb55/sha256.c
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@ -1687,7 +1687,7 @@ struct XXH64_state_s {

 #ifndef XXH_NO_XXH3

-/* Windows SDK under 10.0.22000 check is missing stdalign.h so we add a check
+/* Windows SDK under 10.0.22000 is missing stdalign.h so we add a check
   before allowing the windows compiler to use the C11 form.
   Reference: https://github.com/Cyan4973/xxHash/issues/955 */
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) \
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@ -30,7 +30,7 @@ extern "C" {

 struct hash_params {
    std::string input;
-    bool xxhash = false;
+    bool xxh64 = false;
    bool sha1 = false;
    bool uuid = false;
    bool sha256 = false;
@ -45,7 +45,7 @@ static void hash_print_usage(const char * executable) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help              show this help message and exit\n");
-    printf("      --xxhash            use xxhash\n");
+    printf("      --xxh64             use xxh64\n");
    printf("      --sha1              use sha1\n");
    printf("      --uuid              use uuid\n");
    printf("      --sha256            use sha256\n");
@ -69,9 +69,9 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
            exit(0);
        }

-        if (arg == "--xxhash") {
+        if (arg == "--xxh64") {
            arg_found = true;
-            params.xxhash = true;
+            params.xxh64 = true;
        }

        if (arg == "--sha1") {
@ -94,13 +94,13 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
        }
    }

-    if (!params.xxhash
+    if (!params.xxh64
            && !params.sha1
            && !params.uuid
            && !params.sha256
        ) {
-        // By default if no swich argument provided, assume xxhash
-        params.xxhash = true;
+        // By default if no swich argument provided, assume xxh64
+        params.xxh64 = true;
    }

    if (argc - arg_idx < 1) {
@ -132,16 +132,16 @@ static bool gguf_hash(const hash_params & hash_params) {
        /*.ctx      = */ &ctx_data,
    };

-    // xxhash init
-    XXH64_state_t* xxhash_model_hash_state = NULL;
-    if (hash_params.xxhash) {
-        xxhash_model_hash_state = XXH64_createState();
-        if (xxhash_model_hash_state==NULL) {
+    // xxh64 init
+    XXH64_state_t* xxh64_model_hash_state = NULL;
+    if (hash_params.xxh64) {
+        xxh64_model_hash_state = XXH64_createState();
+        if (xxh64_model_hash_state==NULL) {
            abort();
        }

        XXH64_hash_t const seed = 0;
-        if (XXH64_reset(xxhash_model_hash_state, seed) == XXH_ERROR) {
+        if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
            abort();
        }
    }
@ -166,7 +166,7 @@ static bool gguf_hash(const hash_params & hash_params) {
        auto n_bytes = ggml_nbytes(cur);
        auto *raw_data = cur->data;

-        if (hash_params.xxhash) {
+        if (hash_params.xxh64) {

            // Per Layer Hash
            XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
@ -177,10 +177,10 @@ static bool gguf_hash(const hash_params & hash_params) {
                sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
            }

-            printf("xxhash  %s  %s:%s\n", hex_result, fname.c_str(), name);
+            printf("xxh64   %s  %s:%s\n", hex_result, fname.c_str(), name);

            // Overall Model Hash
-            if (XXH64_update(xxhash_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
+            if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
        }

        if (hash_params.sha1) {
@ -218,8 +218,8 @@ static bool gguf_hash(const hash_params & hash_params) {
        }
    }

-    if (hash_params.xxhash) {
-        XXH64_hash_t const hash = XXH64_digest(xxhash_model_hash_state);
+    if (hash_params.xxh64) {
+        XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);

        char hex_result[17];
        for (int  offset = 0; offset < 8; offset++) {
@ -227,7 +227,7 @@ static bool gguf_hash(const hash_params & hash_params) {
            sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
        }

-        printf("xxhash  %s  %s\n", hex_result, fname.c_str());
+        printf("xxh64   %s  %s\n", hex_result, fname.c_str());
    }

    if (hash_params.sha1) {