From ff0912e323e172e17205b02954f8c2737f151f1e Mon Sep 17 00:00:00 2001 From: brian khuu Date: Sun, 7 Jul 2024 04:32:37 +1000 Subject: [PATCH] llama-gguf-hash: verification added --- examples/gguf-hash/README.md | 135 ++++++++ examples/gguf-hash/gguf-hash.cpp | 572 +++++++++++++++++++++++++------ 2 files changed, 599 insertions(+), 108 deletions(-) diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md index ce0d80943..2320ffd81 100644 --- a/examples/gguf-hash/README.md +++ b/examples/gguf-hash/README.md @@ -5,10 +5,15 @@ CLI to hash GGUF files to detect difference on a per model and per tensor level. **Command line options:** +- `--help`: display help message - `--xxh64`: use xhash 64bit hash mode (default) - `--sha1`: use sha1 - `--uuid`: use uuid - `--sha256`: use sha256 +- `--all`: use all hashes +- `--no-layer`: exclude per-layer hashes +- `--uuid`: generate UUIDv5 ID +- `-c`, `--check <manifest>`: verify against a manifest ## About @@ -62,6 +67,136 @@ make -C build llama-gguf-hash VERBOSE=1 ./build/bin/llama-gguf-hash --sha256 test.gguf ``` +## Generation and Verification Example + +To generate a manifest we may use this command: + +```bash +./llama-gguf-hash --all test.gguf > test.gguf.manifest +``` + +This generates a manifest like the one below, which contains multiple hash types as well as per-tensor-layer hashes +(This excludes the UUID, as that is an ID and not a hash) + +```bash +xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 +sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 +sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 +xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 +sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 +sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 +xxh64 a0af5d700049693b test.gguf:tensor_2 +sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 +sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 
test.gguf:tensor_2 +xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 +sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 +sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 +xxh64 1257733306b7992d test.gguf:tensor_4 +sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 +sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 +xxh64 d238d16ba4711e58 test.gguf:tensor_5 +sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 +sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 +xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 +sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 +sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 +xxh64 c22021c29854f093 test.gguf:tensor_7 +sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 +sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 +xxh64 936df61f5d64261f test.gguf:tensor_8 +sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 +sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 +xxh64 93fd20c64421c081 test.gguf:tensor_9 +sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 +sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 +xxh64 5a54d3aad816f302 test.gguf +sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf +sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf +``` + +We can then use the normal check command which will by default check for the highest security strength hash and verify against that: + +```bash +$ ./llama-gguf-hash --check test.gguf.manifest test.gguf +manifest test.gguf.manifest sha256 sha1 xxh64 +sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok +sha256 
8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok +sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok +sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok +sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok +sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok +sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok +sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok +sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok +sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok +sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok + +Verification results for test.gguf.manifest - Success +``` + +Or we may explicitly ask for a faster hash like: + +```bash +$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf +manifest test.gguf.manifest sha256 sha1 xxh64 +xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok +xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok +xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok +xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok +xxh64 1257733306b7992d test.gguf:tensor_4 - Ok +xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok +xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok +xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok +xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok +xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok +xxh64 5a54d3aad816f302 test.gguf - Ok + +Verification results for test.gguf.manifest - Success +``` + +Or maybe we want to check that all of the hashes are valid: + +```bash +$ ./llama-gguf-hash --check test.gguf.manifest --all test.gguf +manifest test.gguf.manifest sha256 sha1 xxh64 +xxh64 f66e9cd66a4396a0 
test.gguf:tensor_0 - Ok +sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok +sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok +xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok +sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok +sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok +xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok +sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok +sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok +xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok +sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok +sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok +xxh64 1257733306b7992d test.gguf:tensor_4 - Ok +sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok +sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok +xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok +sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok +sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok +xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok +sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok +sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok +xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok +sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok +sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok +xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok +sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok +sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok +xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok +sha1 
7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok +sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok +xxh64 5a54d3aad816f302 test.gguf - Ok +sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok +sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok + +Verification results for test.gguf.manifest - Success +``` + + ## Crypto/Hash Libraries Used These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs) diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index cfb0cc258..1447dddee 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -6,9 +6,10 @@ #include #include #include - #include +#include +#include #ifdef __cplusplus extern "C" { @@ -28,14 +29,70 @@ extern "C" { #define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5 +#define HASH_TYPE_SHA256_STR "sha256" +#define HASH_TYPE_SHA1_STR "sha1" +#define HASH_TYPE_XXH64_STR "xxh64" +#define HASH_TYPE_UUID_STR "uuid" + + +typedef enum { + HASH_EXIT_SUCCESS = 0, // All hash has been generated or validated + HASH_EXIT_FAILURE = 1, // Generic Failure + HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation + HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest + HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it + HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format +} hash_exit_code_t; + + +typedef enum { + HASH_MANIFEST_NOT_FOUND, + HASH_MANIFEST_MISMATCH, + HASH_MANIFEST_OK, +} hash_manifest_result_t; + + struct hash_params { std::string input; bool xxh64 = false; bool sha1 = false; - bool uuid = false; bool sha256 = false; + bool uuid = false; + + bool no_layer = false; + + bool 
manifest_is_usable = false; + std::string manifest_file; }; +struct manifest_check_params { + bool xxh64 = false; + bool sha1 = false; + bool sha256 = false; + bool uuid = false; +}; + +static char const * hash_manifest_result_to_str(hash_manifest_result_t value) { + switch (value) { + case HASH_MANIFEST_NOT_FOUND: return "Not Found"; + case HASH_MANIFEST_MISMATCH: return "Mismatch"; + case HASH_MANIFEST_OK: return "Ok"; + } + return "?"; +} + +static char const * hash_exit_code_to_str(hash_exit_code_t value) { + switch (value) { + case HASH_EXIT_SUCCESS: return "Success"; + case HASH_EXIT_FAILURE: return "Failure"; + case HASH_EXIT_MISMATCH: return "Mismatch"; + case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry"; + case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash"; + case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error"; + } + return "?"; +} + static void hash_print_usage(const char * executable) { const hash_params default_params; printf("\n"); @@ -45,15 +102,19 @@ static void hash_print_usage(const char * executable) { printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" --xxh64 use xxh64\n"); - printf(" --sha1 use sha1\n"); - printf(" --uuid use uuid\n"); - printf(" --sha256 use sha256\n"); + printf(" --xxh64 use xxh64 hash\n"); + printf(" --sha1 use sha1 hash\n"); + printf(" --sha256 use sha256 hash\n"); + printf(" --all use all hash\n"); + printf(" --no-layer exclude per layer hash\n"); + printf(" --uuid generate UUIDv5 ID\n"); + printf(" -c, --check verify against a manifest\n"); printf("\n"); } static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) { std::string arg; + bool invalid_param = false; const std::string arg_prefix = "--"; int arg_idx = 1; @@ -89,18 +150,34 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par params.sha256 = true; } + if (arg == "--all") { + arg_found = true; + 
params.sha256 = true; + params.sha1 = true; + params.xxh64 = true; + } + + if (arg == "--no-layer") { + arg_found = true; + params.no_layer = true; + } + + if (arg == "-c" || arg == "--check") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + params.manifest_file = argv[arg_idx]; + } + if (!arg_found) { throw std::invalid_argument("error: unknown argument: " + arg); } } - if (!params.xxh64 - && !params.sha1 - && !params.uuid - && !params.sha256 - ) { - // By default if no swich argument provided, assume xxh64 - params.xxh64 = true; + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument:" + arg); } if (argc - arg_idx < 1) { @@ -123,7 +200,89 @@ static bool hash_params_parse(int argc, const char ** argv, hash_params & params return result; } -static bool gguf_hash(const hash_params & hash_params) { +static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) { + if (manifest_file.empty()) { + return false; + } + + std::ifstream file(manifest_file); + if (!file.is_open()) { + return false; + } + + std::string manifest_entry_line; + while (getline(file, manifest_entry_line)) { + // hash_type_str hash_str tensor_name + // e.g. 
'xxh64 f66e9cd66a4396a0 test.gguf:tensor_0' + std::istringstream line_stream(manifest_entry_line); + std::string file_hash_type; + if (line_stream >> file_hash_type) { + if (file_hash_type == HASH_TYPE_SHA256_STR) { + manifest_check.sha256 = true; + } else if (file_hash_type == HASH_TYPE_SHA1_STR) { + manifest_check.sha1 = true; + } else if (file_hash_type == HASH_TYPE_XXH64_STR) { + manifest_check.xxh64 = true; + } else if (file_hash_type == HASH_TYPE_UUID_STR) { + manifest_check.uuid = true; + } + } + } + + return true; +} + +static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) { + if (manifest_file.empty()) { + return HASH_MANIFEST_NOT_FOUND; + } + + std::ifstream file(manifest_file); + if (!file.is_open()) { + return HASH_MANIFEST_NOT_FOUND; + } + + std::string manifest_entry_line; + while (getline(file, manifest_entry_line)) { + std::istringstream line_stream(manifest_entry_line); + std::string file_hash_type; + std::string file_hash; + std::string file_tensor_name; + if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) { + // Line parsed. Check hash validity + + if (file_hash_type != hash_type_str) { + continue; + } + + if (file_tensor_name != tensor_name) { + continue; + } + + return (file_hash == hash_str) ? 
HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH; + } + } + + return HASH_MANIFEST_NOT_FOUND; +} + +static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) { + // Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5 + // Assumes that digest was processed correctly with the expected namespace + for (int i = 0; i < 16; i++) { + uuid[i] = sha1_digest[i]; + } + + // Set bits corresponding to UUID ver 5 + uuid[ 6] &= ~(0xF << 4); + uuid[ 6] |= (5 << 4); + + // Set bits corresponding to UUID variant 0b10XX + uuid[ 8] &= ~(0xc << 4); + uuid[ 8] |= (0x8 << 4); +} + +static hash_exit_code_t gguf_hash(const hash_params & hash_params) { const std::string & fname = hash_params.input; struct ggml_context * ctx_data = NULL; @@ -158,64 +317,141 @@ static bool gguf_hash(const hash_params & hash_params) { sha256_init(&sha256_model_hash_ctx); } + // sha1 for uuid init + SHA1_CTX sha1_for_uuid_ctx; + if (hash_params.uuid) { + unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX}; + SHA1Init(&sha1_for_uuid_ctx); + SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace)); + } + struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); const int n_tensors = gguf_get_n_tensors(ctx); + bool tensor_layer_in_manifest = false; + bool model_in_manifest = false; + bool tensor_layer_has_mismatch = false; + bool model_has_mismatch = false; for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); auto n_bytes = ggml_nbytes(cur); auto *raw_data = cur->data; + const std::string tensor_layer_name = fname + ":" + name; if (hash_params.xxh64) { - // Per Layer Hash - XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0); + if (!hash_params.no_layer) { + // Per Layer Hash + XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0); - char hex_result[17]; - for (int offset = 0; offset < 8; offset++) { - unsigned int 
shift_bits_by = (8 * (8 - offset - 1)); - sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff); + char hex_result[17]; + for (int offset = 0; offset < 8; offset++) { + unsigned int shift_bits_by = (8 * (8 - offset - 1)); + sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff); + } + + if (hash_params.manifest_is_usable) { + hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name); + + switch (verify_result) { + case HASH_MANIFEST_NOT_FOUND: + break; + case HASH_MANIFEST_MISMATCH: + tensor_layer_in_manifest = true; + tensor_layer_has_mismatch = true; + break; + case HASH_MANIFEST_OK: + tensor_layer_in_manifest = true; + break; + } + + printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result)); + } else { + printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str()); + } } - printf("xxh64 %s %s:%s\n", hex_result, fname.c_str(), name); - // Overall Model Hash if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort(); } if (hash_params.sha1) { - // Per Layer Hash - char result[21]; // sha1 outputs 20 bytes - SHA1( result, (const char *)raw_data, n_bytes); + if (!hash_params.no_layer) { + // Per Layer Hash + char result[21]; // sha1 outputs 20 bytes + SHA1( result, (const char *)raw_data, n_bytes); - char hex_result[41] = {0}; - for (int offset = 0; offset < 20; offset++) { - sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff); + char hex_result[41] = {0}; + for (int offset = 0; offset < 20; offset++) { + sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff); + } + + if (hash_params.manifest_is_usable) { + hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name); + + switch (verify_result) { + case 
HASH_MANIFEST_NOT_FOUND: + break; + case HASH_MANIFEST_MISMATCH: + tensor_layer_in_manifest = true; + tensor_layer_has_mismatch = true; + break; + case HASH_MANIFEST_OK: + tensor_layer_in_manifest = true; + break; + } + + printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result)); + } else { + printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str()); + } } - printf("sha1 %s %s:%s\n", hex_result, fname.c_str(), name); - // Overall Model Hash SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes); } if (hash_params.sha256) { - // Per Layer Hash - unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes - sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes); + if (!hash_params.no_layer) { + // Per Layer Hash + unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes + sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes); - char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0}; - for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) { - sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff); + char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0}; + for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) { + sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff); + } + + if (hash_params.manifest_is_usable) { + hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name); + + switch (verify_result) { + case HASH_MANIFEST_NOT_FOUND: + break; + case HASH_MANIFEST_MISMATCH: + tensor_layer_in_manifest = true; + tensor_layer_has_mismatch = true; + break; + case HASH_MANIFEST_OK: + tensor_layer_in_manifest = true; + break; + } + + printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result)); + } else { + 
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str()); + } } - printf("sha256 %s %s:%s\n", hex_result, fname.c_str(), name); - // Overall Model Hash sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes); } + + if (hash_params.uuid) { + SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes); + } } if (hash_params.xxh64) { @@ -227,7 +463,25 @@ static bool gguf_hash(const hash_params & hash_params) { sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff); } - printf("xxh64 %s %s\n", hex_result, fname.c_str()); + if (hash_params.manifest_is_usable) { + hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname); + + switch (verify_result) { + case HASH_MANIFEST_NOT_FOUND: + break; + case HASH_MANIFEST_MISMATCH: + model_in_manifest = true; + model_has_mismatch = true; + break; + case HASH_MANIFEST_OK: + model_in_manifest = true; + break; + } + + printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result)); + } else { + printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str()); + } } if (hash_params.sha1) { @@ -239,7 +493,25 @@ static bool gguf_hash(const hash_params & hash_params) { sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff); } - printf("sha1 %s %s\n", hex_result, fname.c_str()); + if (hash_params.manifest_is_usable) { + hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname); + + switch (verify_result) { + case HASH_MANIFEST_NOT_FOUND: + break; + case HASH_MANIFEST_MISMATCH: + model_in_manifest = true; + model_has_mismatch = true; + break; + case HASH_MANIFEST_OK: + model_in_manifest = true; + break; + } + + printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result)); + } else { 
+ printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str()); + } } if (hash_params.sha256) { @@ -251,87 +523,171 @@ static bool gguf_hash(const hash_params & hash_params) { sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff); } - printf("sha256 %s %s\n", hex_result, fname.c_str()); + if (hash_params.manifest_is_usable) { + hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname); + + switch (verify_result) { + case HASH_MANIFEST_NOT_FOUND: + break; + case HASH_MANIFEST_MISMATCH: + model_in_manifest = true; + model_has_mismatch = true; + break; + case HASH_MANIFEST_OK: + model_in_manifest = true; + break; + } + + printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result)); + } else { + printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str()); + } } + if (hash_params.uuid) { + unsigned char result[21]; + SHA1Final(result, &sha1_for_uuid_ctx); + + unsigned char uuid[16]; + generate_uuidv5(result, uuid); + + char string_buffer[37] = {0}; + sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15]); + + if (hash_params.manifest_is_usable) { + hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_UUID_STR, string_buffer, fname); // look up the "uuid" row; the "sha256" row holds a digest, not a UUID + + switch (verify_result) { + case HASH_MANIFEST_NOT_FOUND: + break; + case HASH_MANIFEST_MISMATCH: + model_in_manifest = true; + model_has_mismatch = true; + break; + case HASH_MANIFEST_OK: + model_in_manifest = true; + break; + } + + printf("%-8s %-s %s - %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result)); + } else { + printf("%-8s %-s %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str()); + } 
+ } + + ggml_free(ctx_data); gguf_free(ctx); - return true; -} -static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) { - // Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5 - // Assumes that digest was processed correctly with the expected namespace - for (int i = 0; i < 16; i++) { - uuid[i] = sha1_digest[i]; + if (hash_params.manifest_is_usable) { + // In hash verification mode + + if (!model_in_manifest) { + // model missing in manifest? + + // Check tensor layer... + if (!tensor_layer_in_manifest) { + // Still missing? Maybe we are reading the wrong manifest. + return HASH_EXIT_MANIFEST_MISSING_ENTRY; + } + + if (tensor_layer_has_mismatch) { + // Per tensor check found a hash mismatch + return HASH_EXIT_MISMATCH; + } + + // All per tensor layer checks passed? Sounds good enough. + return HASH_EXIT_SUCCESS; + } + + // Model entry is present in the manifest, but still honour any per layer mismatch + // If layer entries are missing, we don't care too much as the overall model is checked + if (tensor_layer_in_manifest && tensor_layer_has_mismatch) { + return HASH_EXIT_MISMATCH; + } + + if (model_has_mismatch) { + // overall model hash differs from the manifest entry + return HASH_EXIT_MISMATCH; + } + + // All checks appear to be fine + return HASH_EXIT_SUCCESS; } - // Set bits corresponding to UUID ver 5 - uuid[ 6] &= ~(0xF << 4); - uuid[ 6] |= (5 << 4); - - // Set bits corresponding to UUID variant 0b10XX - uuid[ 8] &= ~(0xc << 4); - uuid[ 8] |= (0x8 << 4); -} - -static bool gguf_uuid(const hash_params & hash_params) { - if (!hash_params.uuid) { - return true; - } - - const std::string & fname = hash_params.input; - struct ggml_context * ctx_data = NULL; - - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx_data, - }; - - // sha1 init - SHA1_CTX sha1_model_hash_ctx; - SHA1Init(&sha1_model_hash_ctx); - - unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX}; - SHA1Update( &sha1_model_hash_ctx, (unsigned char const 
*)uuidv5_namespace, sizeof(uuidv5_namespace)); - - struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); - const int n_tensors = gguf_get_n_tensors(ctx); - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - auto n_bytes = ggml_nbytes(cur); - auto *raw_data = cur->data; - SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes); - } - - unsigned char result[21]; - SHA1Final(result, &sha1_model_hash_ctx); - - unsigned char uuid[16]; - generate_uuidv5(result, uuid); - - char string_buffer[37] = {0}; - sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - uuid[0], uuid[1], uuid[2], uuid[3], - uuid[4], uuid[5], uuid[6], uuid[7], - uuid[8], uuid[9], uuid[10], uuid[11], - uuid[12], uuid[13], uuid[14], uuid[15]); - printf("UUIDv5 %s %s\n", string_buffer, fname.c_str()); - - ggml_free(ctx_data); - gguf_free(ctx); - - return true; + // In hash generation mode + return HASH_EXIT_SUCCESS; } int main(int argc, const char ** argv) { hash_params params; + manifest_check_params manifest_check; hash_params_parse(argc, argv, params); - gguf_hash(params); - gguf_uuid(params); + if (!params.manifest_file.empty()) { + if (!manifest_type(params.manifest_file, manifest_check)) { + printf("ERROR cannot open manifest %s\n", params.manifest_file.c_str()); + return HASH_EXIT_MANIFEST_FILE_ERROR; + } - return 0; + if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) { + printf("ERROR manifest does not have any known hash format in %s\n", params.manifest_file.c_str()); + return HASH_EXIT_MANIFEST_UNKNOWN_HASH; + } + + printf("manifest %s", params.manifest_file.c_str()); + + if (manifest_check.sha256) { + printf(" sha256"); + } + + if (manifest_check.sha1) { + printf(" sha1"); + } + + if (manifest_check.xxh64) { + printf(" xxh64"); + } + + if (manifest_check.uuid) { + 
+ printf(" uuid"); + } + + printf("\n"); + + // Autoselect the highest security hash if manifest is provided but + // the user has not specifically defined the hash they care about + if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) { + // User has not selected a specific value, pick most secure hash + if (manifest_check.sha256) { + params.sha256 = true; + } else if (manifest_check.sha1) { + params.sha1 = true; + } else if (manifest_check.xxh64) { + params.xxh64 = true; + } else if (manifest_check.uuid) { + params.uuid = true; + } + } + + params.manifest_is_usable = true; // switches gguf_hash() from generation to verification mode + } + + // By default if no switch argument provided, assume xxh64 + if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) { + params.xxh64 = true; + } + + hash_exit_code_t exit_code = gguf_hash(params); + + if (params.manifest_is_usable) { + printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code)); + } + + return exit_code; }