llama-gguf-hash: change argument from xxhash --> xxh64 and update readme

This commit is contained in:
brian khuu 2024-07-06 21:48:24 +10:00
parent 8f3749642e
commit 8fed75edca
3 changed files with 68 additions and 28 deletions

View file

@ -1,31 +1,71 @@
## GGUF hash Example
CLI to hash GGUF files.
# llama-gguf-hash
CLI to hash GGUF files to detect differences on a per-model and per-tensor level.
**Command line options:**
- `--xxhash`: use xhash (default)
- `--xxh64`: use xxhash 64bit hash mode (default)
- `--sha1`: use sha1
- `--uuid`: use uuid
- `--sha256`: use sha256
### Compile Example
## About
```
While most POSIX systems already have hash checking programs like sha256sum, they
are designed to check entire files. This is not ideal for our purpose if we want
to check for consistency of the tensor data even if the metadata content of the
gguf KV store has been updated.
This program is designed to hash a gguf tensor payload on a 'per tensor layer'
basis in addition to an 'entire tensor model' hash. The intent is that the entire
tensor model can be checked first, but if there are any detected inconsistencies,
then the per tensor hash can be used to narrow down the specific tensor layer
that has inconsistencies.
For Maintainers:
- Detection of tensor inconsistency during development and automated tests
- This is served by xxh64 which is fast
- This is also served by having per tensor layer to assist in narrowing down
the location of the faulty tensor layer
- This is also served by sha1 which is much slower but more widely supported
For Model Creators:
- Optional consistent UUID generation based on model tensor content
- This is served by UUIDv5 which is useful for database keys
- llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
- Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
For Model Users:
- Assurance of tensor layer integrity even if metadata was updated
- This is served by sha256 which is still considered very secure as of 2024
### Design Note
- The default behavior of this program if no arguments are provided is to hash
using xxhash's xxh64 mode because it is very fast and is primarily targeted
towards maintainers who may want to use this in automated tests.
- xxhash supports xxh32 and xxh128 for 32bit and 128bit hashes respectively;
however, we picked 64bit xxhash as most computers are 64bit as of 2024 and thus
would have a better affinity for calculating a hash that is 64bit in size.
## Compile Example
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
make -C build clean
make -C build llama-gguf-hash VERBOSE=1
./build/bin/llama-gguf-hash test.gguf
./build/bin/llama-gguf-hash --xxhash test.gguf
./build/bin/llama-gguf-hash --xxh64 test.gguf
./build/bin/llama-gguf-hash --sha1 test.gguf
./build/bin/llama-gguf-hash --uuid test.gguf
./build/bin/llama-gguf-hash --sha256 test.gguf
```
### Crypto/Hash Libraries Used
## Crypto/Hash Libraries Used
These micro C library dependencies were installed via the [clib c package manager](https://github.com/clibs)
- https://github.com/mofosyne/xxHash
- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
- https://github.com/clibs/sha1/
- https://github.com/jb55/sha256.c

View file

@ -1687,7 +1687,7 @@ struct XXH64_state_s {
#ifndef XXH_NO_XXH3
/* Windows SDK under 10.0.22000 check is missing stdalign.h so we add a check
/* Windows SDK under 10.0.22000 is missing stdalign.h so we add a check
before allowing the windows compiler to use the C11 form.
Reference: https://github.com/Cyan4973/xxHash/issues/955 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) \

View file

@ -30,7 +30,7 @@ extern "C" {
struct hash_params {
std::string input;
bool xxhash = false;
bool xxh64 = false;
bool sha1 = false;
bool uuid = false;
bool sha256 = false;
@ -45,7 +45,7 @@ static void hash_print_usage(const char * executable) {
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --xxhash use xxhash\n");
printf(" --xxh64 use xxh64\n");
printf(" --sha1 use sha1\n");
printf(" --uuid use uuid\n");
printf(" --sha256 use sha256\n");
@ -69,9 +69,9 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
exit(0);
}
if (arg == "--xxhash") {
if (arg == "--xxh64") {
arg_found = true;
params.xxhash = true;
params.xxh64 = true;
}
if (arg == "--sha1") {
@ -94,13 +94,13 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
}
}
if (!params.xxhash
if (!params.xxh64
&& !params.sha1
&& !params.uuid
&& !params.sha256
) {
// By default if no swich argument provided, assume xxhash
params.xxhash = true;
// By default if no switch argument is provided, assume xxh64
params.xxh64 = true;
}
if (argc - arg_idx < 1) {
@ -132,16 +132,16 @@ static bool gguf_hash(const hash_params & hash_params) {
/*.ctx = */ &ctx_data,
};
// xxhash init
XXH64_state_t* xxhash_model_hash_state = NULL;
if (hash_params.xxhash) {
xxhash_model_hash_state = XXH64_createState();
if (xxhash_model_hash_state==NULL) {
// xxh64 init
XXH64_state_t* xxh64_model_hash_state = NULL;
if (hash_params.xxh64) {
xxh64_model_hash_state = XXH64_createState();
if (xxh64_model_hash_state==NULL) {
abort();
}
XXH64_hash_t const seed = 0;
if (XXH64_reset(xxhash_model_hash_state, seed) == XXH_ERROR) {
if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
abort();
}
}
@ -166,7 +166,7 @@ static bool gguf_hash(const hash_params & hash_params) {
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
if (hash_params.xxhash) {
if (hash_params.xxh64) {
// Per Layer Hash
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
@ -177,10 +177,10 @@ static bool gguf_hash(const hash_params & hash_params) {
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
printf("xxhash %s %s:%s\n", hex_result, fname.c_str(), name);
printf("xxh64 %s %s:%s\n", hex_result, fname.c_str(), name);
// Overall Model Hash
if (XXH64_update(xxhash_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
}
if (hash_params.sha1) {
@ -218,8 +218,8 @@ static bool gguf_hash(const hash_params & hash_params) {
}
}
if (hash_params.xxhash) {
XXH64_hash_t const hash = XXH64_digest(xxhash_model_hash_state);
if (hash_params.xxh64) {
XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
@ -227,7 +227,7 @@ static bool gguf_hash(const hash_params & hash_params) {
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
printf("xxhash %s %s\n", hex_result, fname.c_str());
printf("xxh64 %s %s\n", hex_result, fname.c_str());
}
if (hash_params.sha1) {