From 8fed75edca306996364aa85133a284df152d13bd Mon Sep 17 00:00:00 2001 From: brian khuu Date: Sat, 6 Jul 2024 21:48:24 +1000 Subject: [PATCH] llama-gguf-hash: change argument from xxhash --> xxh64 and update readme --- examples/gguf-hash/README.md | 56 +++++++++++++++++++++---- examples/gguf-hash/deps/xxhash/xxhash.h | 2 +- examples/gguf-hash/gguf-hash.cpp | 38 ++++++++--------- 3 files changed, 68 insertions(+), 28 deletions(-) diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md index 1b23e8dd3..ce0d80943 100644 --- a/examples/gguf-hash/README.md +++ b/examples/gguf-hash/README.md @@ -1,31 +1,71 @@ -## GGUF hash Example -CLI to hash GGUF files. +# llama-gguf-hash + +CLI to hash GGUF files to detect differences on a per model and per tensor level. **Command line options:** -- `--xxhash`: use xhash (default) +- `--xxh64`: use xxhash 64bit hash mode (default) - `--sha1`: use sha1 - `--uuid`: use uuid - `--sha256`: use sha256 -### Compile Example +## About -``` +While most POSIX systems already have hash checking programs like sha256sum, it +is designed to check entire files. This is not ideal for our purpose if we want +to check for consistency of the tensor data even if the metadata content of the +gguf KV store has been updated. + +This program is designed to hash a gguf tensor payload on a 'per tensor layer' +in addition to an 'entire tensor model' hash. The intent is that the entire +tensor layer can be checked first but if there are any detected inconsistencies, +then the per tensor hash can be used to narrow down the specific tensor layer +that has inconsistencies.
+ +For Maintainers: +- Detection of tensor inconsistency during development and automated tests + - This is served by xxh64 which is fast + - This is also served by having per tensor layer hashes to assist in narrowing down + the location of the faulty tensor layer + - This is also served by sha1 which is much slower but more widely supported + +For Model Creators: +- Optional consistent UUID generation based on model tensor content + - This is served by UUIDv5 which is useful for database keys + - llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5` + - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp` + +For Model Users: +- Assurance of tensor layer integrity even if metadata was updated + - This is served by sha256 which is still considered very secure as of 2024 + +### Design Note + +- The default behavior of this program if no arguments are provided is to hash + using xxhash's xxh64 mode because it is very fast and is primarily targeted + towards maintainers who may want to use this in automated tests. +- xxhash supports xxh32 and xxh128 for 32bit hash and 128bit hash respectively + however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus + would have a better affinity to calculating hash that is 64bit in size.
+ +## Compile Example + +```bash cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON make -C build clean make -C build llama-gguf-hash VERBOSE=1 ./build/bin/llama-gguf-hash test.gguf -./build/bin/llama-gguf-hash --xxhash test.gguf +./build/bin/llama-gguf-hash --xxh64 test.gguf ./build/bin/llama-gguf-hash --sha1 test.gguf ./build/bin/llama-gguf-hash --uuid test.gguf ./build/bin/llama-gguf-hash --sha256 test.gguf ``` -### Crypto/Hash Libraries Used +## Crypto/Hash Libraries Used These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs) -- https://github.com/mofosyne/xxHash +- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash) - https://github.com/clibs/sha1/ - https://github.com/jb55/sha256.c diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h index 599fea292..c0fafe20d 100644 --- a/examples/gguf-hash/deps/xxhash/xxhash.h +++ b/examples/gguf-hash/deps/xxhash/xxhash.h @@ -1687,7 +1687,7 @@ struct XXH64_state_s { #ifndef XXH_NO_XXH3 -/* Windows SDK under 10.0.22000 check is missing stdalign.h so we add a check +/* Windows SDK under 10.0.22000 is missing stdalign.h so we add a check before allowing the windows compiler to use the C11 form. 
Reference: https://github.com/Cyan4973/xxHash/issues/955 */ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) \ diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index c5fa7af57..cfb0cc258 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -30,7 +30,7 @@ extern "C" { struct hash_params { std::string input; - bool xxhash = false; + bool xxh64 = false; bool sha1 = false; bool uuid = false; bool sha256 = false; @@ -45,7 +45,7 @@ static void hash_print_usage(const char * executable) { printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" --xxhash use xxhash\n"); + printf(" --xxh64 use xxh64\n"); printf(" --sha1 use sha1\n"); printf(" --uuid use uuid\n"); printf(" --sha256 use sha256\n"); @@ -69,9 +69,9 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par exit(0); } - if (arg == "--xxhash") { + if (arg == "--xxh64") { arg_found = true; - params.xxhash = true; + params.xxh64 = true; } if (arg == "--sha1") { @@ -94,13 +94,13 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par } } - if (!params.xxhash + if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256 ) { - // By default if no swich argument provided, assume xxhash - params.xxhash = true; + // By default if no swich argument provided, assume xxh64 + params.xxh64 = true; } if (argc - arg_idx < 1) { @@ -132,16 +132,16 @@ static bool gguf_hash(const hash_params & hash_params) { /*.ctx = */ &ctx_data, }; - // xxhash init - XXH64_state_t* xxhash_model_hash_state = NULL; - if (hash_params.xxhash) { - xxhash_model_hash_state = XXH64_createState(); - if (xxhash_model_hash_state==NULL) { + // xxh64 init + XXH64_state_t* xxh64_model_hash_state = NULL; + if (hash_params.xxh64) { + xxh64_model_hash_state = XXH64_createState(); + if (xxh64_model_hash_state==NULL) { abort(); } XXH64_hash_t const seed = 0; - if 
(XXH64_reset(xxhash_model_hash_state, seed) == XXH_ERROR) { + if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) { abort(); } } @@ -166,7 +166,7 @@ static bool gguf_hash(const hash_params & hash_params) { auto n_bytes = ggml_nbytes(cur); auto *raw_data = cur->data; - if (hash_params.xxhash) { + if (hash_params.xxh64) { // Per Layer Hash XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0); @@ -177,10 +177,10 @@ static bool gguf_hash(const hash_params & hash_params) { sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff); } - printf("xxhash %s %s:%s\n", hex_result, fname.c_str(), name); + printf("xxh64 %s %s:%s\n", hex_result, fname.c_str(), name); // Overall Model Hash - if (XXH64_update(xxhash_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort(); + if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort(); } if (hash_params.sha1) { @@ -218,8 +218,8 @@ static bool gguf_hash(const hash_params & hash_params) { } } - if (hash_params.xxhash) { - XXH64_hash_t const hash = XXH64_digest(xxhash_model_hash_state); + if (hash_params.xxh64) { + XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state); char hex_result[17]; for (int offset = 0; offset < 8; offset++) { @@ -227,7 +227,7 @@ static bool gguf_hash(const hash_params & hash_params) { sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff); } - printf("xxhash %s %s\n", hex_result, fname.c_str()); + printf("xxh64 %s %s\n", hex_result, fname.c_str()); } if (hash_params.sha1) {